1997 lines
81 KiB
Python
1997 lines
81 KiB
Python
#!/usr/bin/env python3
|
|
# /// script
|
|
# requires-python = ">=3.8"
|
|
# dependencies = []
|
|
# ///
|
|
"""MinerU CLI — parse PDF / Office / image files into clean Markdown.
|
|
|
|
Zero-dependency (Python standard library only) and AI-Native. The tool picks the
|
|
right MinerU backend automatically:
|
|
|
|
* no token -> Agent API (free, no login; <=10 MB, <=20 pages)
|
|
* token + small file -> Agent API (fast & free, auto-escalates on limits)
|
|
* token + big/batch/fmt -> Standard v4 (<=200 MB, <=200 pages, docx/html/latex)
|
|
|
|
Token: https://mineru.net/apiManage/token
|
|
Docs: https://mineru.net/apiManage/docs
|
|
|
|
Examples
|
|
--------
|
|
# Zero-config single file (no token needed)
|
|
python3 mineru.py paper.pdf
|
|
|
|
# Pipe the Markdown straight back to an agent
|
|
python3 mineru.py paper.pdf --stdout
|
|
|
|
# Batch a directory with a token (Standard API, parallel)
|
|
export MINERU_TOKEN=...
|
|
python3 mineru.py ./pdfs/ --output ./out/ --workers 8 --resume
|
|
|
|
# Parse a remote URL and also export DOCX
|
|
python3 mineru.py https://example.com/doc.pdf --format docx
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import http.client
|
|
import json
|
|
import os
|
|
import random
|
|
import re
|
|
import sys
|
|
import tempfile
|
|
import threading
|
|
import time
|
|
import urllib.error
|
|
import urllib.parse
|
|
import urllib.request
|
|
import zipfile
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from dataclasses import dataclass, field
|
|
from io import BytesIO
|
|
from pathlib import Path, PurePosixPath
|
|
from typing import Optional
|
|
|
|
__version__ = "3.3.1"
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Constants (kept in sync with https://mineru.net/apiManage/docs)
|
|
# --------------------------------------------------------------------------- #
|
|
STANDARD_API = "https://mineru.net/api/v4"
|
|
AGENT_API = "https://mineru.net/api/v1/agent"
|
|
|
|
AGENT_MAX_BYTES = 10 * 1024 * 1024 # 10 MB
|
|
AGENT_MAX_PAGES = 20
|
|
STANDARD_MAX_BYTES = 200 * 1024 * 1024 # 200 MB
|
|
STANDARD_MAX_PAGES = 200
|
|
BATCH_MAX_FILES = 50 # per Standard batch request
|
|
FREE_DAILY_PAGES = 1000 # highest-priority quota / day
|
|
|
|
USER_AGENT = f"MinerU-Skill/{__version__}"
|
|
|
|
# Reliability tuning ---------------------------------------------------------- #
|
|
# Transient HTTP statuses worth a backed-off retry (never business code != 0).
|
|
RETRY_STATUSES = {408, 425, 429, 500, 502, 503, 504}
|
|
RETRY_MAX_ATTEMPTS = 4 # total tries per request (1 + 3 retries)
|
|
RETRY_BASE_DELAY = 0.5 # seconds; doubles each attempt
|
|
RETRY_MAX_DELAY = 20.0 # backoff ceiling
|
|
DEFAULT_POLL_INTERVAL = 2.0 # seconds between status polls
|
|
POLL_INTERVAL_CAP = 15.0 # adaptive backoff ceiling while polling
|
|
DEFAULT_WORKERS = 8 # decoupled submit/poll lifts the old thread-bound ceiling
|
|
# Cap for poll/submit network calls. A single stalled request must not wedge the
|
|
# single-threaded poll loop for the whole per-parse budget — that budget lives in
|
|
# job.deadline; the per-request socket timeout is bounded here. Downloads/uploads
|
|
# (large zips) keep the full timeout.
|
|
REQUEST_TIMEOUT_CAP = 30.0
|
|
|
|
# Business-layer API codes that are worth a bounded retry. Authentication,
|
|
# quota, and file-limit failures are intentionally absent: those need user action
|
|
# or a different input, so retrying only burns time/quota.
|
|
RETRYABLE_API_CODES = {-10001, -60001, -60007, -60009}
|
|
FATAL_API_CODES = {"A0202", "A0211", -60005, -60006, -60017, -60018, -60019}
|
|
# Daily-quota / retry-limit codes. Once any submit hits one of these the whole run
|
|
# is doomed for the day, so trip a circuit breaker and skip the remaining submits
|
|
# instead of firing N more doomed requests (FREE_DAILY_PAGES informs the message).
|
|
QUOTA_EXHAUSTED_CODES = {-60017, -60018, -60019}
|
|
|
|
# Input modalities MinerU understands, grouped so the CLI can report what it sees
|
|
# and so support stays single-sourced. The Agent API additionally rejects HTML.
|
|
MODALITY_SUFFIXES = {
|
|
"pdf": {".pdf"},
|
|
"image": {".png", ".jpg", ".jpeg", ".jp2", ".webp", ".gif", ".bmp"},
|
|
"word": {".doc", ".docx"},
|
|
"slides": {".ppt", ".pptx"},
|
|
"sheet": {".xls", ".xlsx"},
|
|
"html": {".html"},
|
|
}
|
|
SUPPORTED_SUFFIXES = {suf for group in MODALITY_SUFFIXES.values() for suf in group}
|
|
|
|
# Error code -> actionable hint. Mirrors the official docs error tables.
|
|
ERROR_HINTS = {
|
|
"A0202": "Invalid token — check it or create a new one at https://mineru.net/apiManage/token",
|
|
"A0211": "Token expired — create a new one at https://mineru.net/apiManage/token",
|
|
-500: "Parameter error — check request parameters and Content-Type",
|
|
-10001: "Service error — please retry later",
|
|
-10002: "Invalid request parameters",
|
|
-60001: "Failed to generate upload URL — retry later",
|
|
-60002: "Unsupported file format — use a correct file extension",
|
|
-60003: "Failed to read file — the file may be corrupted",
|
|
-60004: "Empty file — upload a valid file",
|
|
-60005: "File too large — Standard API max is 200 MB",
|
|
-60006: "Too many pages — Standard API max is 200 pages, split the file",
|
|
-60007: "Model service temporarily unavailable — retry later",
|
|
-60008: "File read timeout — ensure the URL is reachable",
|
|
-60009: "Task queue is full — retry later",
|
|
-60010: "Parse failed — retry later",
|
|
-60011: "Failed to get a valid file — ensure the file was uploaded",
|
|
-60012: "Task not found — check the task_id",
|
|
-60013: "No permission to access this task",
|
|
-60014: "Cannot delete a running task",
|
|
-60015: "File conversion failed — try converting to PDF first",
|
|
-60016: "Format conversion failed — try another export format",
|
|
-60017: "Retry limit reached — try again after a model upgrade",
|
|
-60018: "Daily parse quota reached — try again tomorrow",
|
|
-60019: "Insufficient HTML parse quota — try again tomorrow",
|
|
-60020: "File split failed — retry later",
|
|
-60021: "Failed to read page count — retry later",
|
|
-60022: "Web page read failed — possibly rate-limited, retry later",
|
|
# Agent (lightweight) API specific codes
|
|
-30001: "File exceeds Agent API 10 MB limit — set MINERU_TOKEN to use the Standard API",
|
|
-30002: "Agent API does not support this file type — use PDF/image/Doc/PPT/Excel",
|
|
-30003: "Pages exceed Agent API 20-page limit — set MINERU_TOKEN or pass --pages",
|
|
-30004: "Invalid request parameters — check required fields",
|
|
}
|
|
|
|
# Agent-API error codes that a Standard-API retry can recover from.
|
|
AGENT_ESCALATABLE = {-30001, -30003}
|
|
|
|
# Terminal/transient task states (Standard + Agent share most of these).
|
|
STATE_DONE = "done"
|
|
STATE_FAILED = "failed"
|
|
ACTIVE_STATES = {"pending", "running", "converting", "uploading", "waiting-file"}
|
|
|
|
|
|
class MinerUError(Exception):
|
|
"""Raised when the API returns a non-zero ``code`` or an unrecoverable error."""
|
|
|
|
def __init__(self, message: str, code=None):
|
|
super().__init__(message)
|
|
self.code = code
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Options / results
|
|
# --------------------------------------------------------------------------- #
|
|
@dataclass(frozen=True)
|
|
class ParseOptions:
|
|
model: str = "vlm"
|
|
language: str = "ch"
|
|
is_ocr: bool = False
|
|
enable_formula: bool = True
|
|
enable_table: bool = True
|
|
page_ranges: Optional[str] = None
|
|
extra_formats: tuple = ()
|
|
|
|
|
|
@dataclass
|
|
class ParseResult:
|
|
name: str
|
|
source: str
|
|
api: str = "agent"
|
|
modality: str = "unknown"
|
|
state: str = STATE_FAILED
|
|
output_dir: Optional[str] = None
|
|
markdown_path: Optional[str] = None
|
|
markdown: Optional[str] = None
|
|
task_id: Optional[str] = None
|
|
elapsed: Optional[float] = None
|
|
error: Optional[str] = None
|
|
sinks: list = field(default_factory=list)
|
|
chunks: Optional[list] = None
|
|
|
|
def to_status(self) -> dict:
|
|
"""Machine-readable status used by ``--json`` (omits the full markdown body)."""
|
|
status = {
|
|
"name": self.name,
|
|
"source": self.source,
|
|
"api": self.api,
|
|
"modality": self.modality,
|
|
"state": self.state,
|
|
"output_dir": self.output_dir,
|
|
"markdown_path": self.markdown_path,
|
|
"task_id": self.task_id,
|
|
"elapsed": self.elapsed,
|
|
"error": self.error,
|
|
"sinks": self.sinks,
|
|
}
|
|
if self.chunks is not None:
|
|
status["chunks"] = self.chunks
|
|
return status
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Pure helpers (heavily unit-tested)
|
|
# --------------------------------------------------------------------------- #
|
|
def is_url(value: str) -> bool:
|
|
return value.startswith("http://") or value.startswith("https://")
|
|
|
|
|
|
def safe_stem(source: str) -> str:
|
|
"""Derive a clean output folder name from a file path or URL."""
|
|
tail = source.split("?", 1)[0].rstrip("/")
|
|
name = tail.rsplit("/", 1)[-1] if is_url(source) else Path(source).name
|
|
stem = Path(name).stem or "document"
|
|
return stem
|
|
|
|
|
|
def unique_out_stems(sources) -> list:
|
|
"""On-disk folder/file stem per source, disambiguated ONLY on collision.
|
|
|
|
Distinct basenames keep their bare stem (the documented output-dir contract).
|
|
When two inputs share a basename (``a/report.pdf`` + ``b/report.pdf``) the
|
|
later ones get a ``-2``/``-3`` suffix so neither silently overwrites the other
|
|
on disk or via ``--resume``. Stable for a given input order, so resume keeps
|
|
matching across re-runs.
|
|
"""
|
|
out, seen = [], {}
|
|
for src in sources:
|
|
base = safe_stem(src)
|
|
n = seen.get(base, 0)
|
|
seen[base] = n + 1
|
|
out.append(base if n == 0 else f"{base}-{n + 1}")
|
|
return out
|
|
|
|
|
|
def safe_data_id(stem: str) -> str:
|
|
"""data_id allows [A-Za-z0-9_.-], <=128 chars."""
|
|
cleaned = "".join(c if (c.isalnum() or c in "_.-") else "-" for c in stem)
|
|
return cleaned[:128] or "document"
|
|
|
|
|
|
def suffix_of(source: str) -> str:
|
|
tail = source.split("?", 1)[0]
|
|
return Path(tail).suffix.lower()
|
|
|
|
|
|
def is_supported(source: str) -> bool:
|
|
return suffix_of(source) in SUPPORTED_SUFFIXES
|
|
|
|
|
|
def is_html(source: str) -> bool:
|
|
return suffix_of(source) == ".html"
|
|
|
|
|
|
def detect_modality(source: str) -> str:
|
|
"""Classify the input modality (pdf/image/word/slides/sheet/html/url/unknown)."""
|
|
suffix = suffix_of(source)
|
|
if not suffix and is_url(source):
|
|
return "url"
|
|
for modality, suffixes in MODALITY_SUFFIXES.items():
|
|
if suffix in suffixes:
|
|
return modality
|
|
return "unknown"
|
|
|
|
|
|
def to_agent_page_range(page_ranges: Optional[str]) -> Optional[str]:
|
|
"""Agent API only supports ``from-to`` or a single page (no commas)."""
|
|
if not page_ranges:
|
|
return None
|
|
first = page_ranges.split(",", 1)[0].strip()
|
|
return first or None
|
|
|
|
|
|
def error_hint(code) -> str:
|
|
"""Human-friendly hint for an API error code (falls back to the raw code)."""
|
|
if code in ERROR_HINTS:
|
|
return ERROR_HINTS[code]
|
|
return f"API error (code {code})"
|
|
|
|
|
|
def _result_error(data: dict, default: str = "Parse failed") -> str:
|
|
"""Best available parse-task failure message, preserving documented codes."""
|
|
msg = data.get("err_msg") or data.get("msg")
|
|
code = data.get("err_code") or data.get("code")
|
|
if msg:
|
|
return msg
|
|
if code is not None:
|
|
return error_hint(code)
|
|
return default
|
|
|
|
|
|
def choose_api(
|
|
*,
|
|
token: Optional[str],
|
|
source: str,
|
|
size_bytes: Optional[int],
|
|
batch: bool,
|
|
extra_formats,
|
|
explicit: str = "auto",
|
|
) -> str:
|
|
"""Decide which backend to use. ``explicit`` of 'agent'/'standard' wins."""
|
|
if explicit in ("agent", "standard"):
|
|
return explicit
|
|
# HTML is Standard-only (MinerU-HTML model); Agent API rejects it.
|
|
if is_html(source):
|
|
return "standard"
|
|
if not token:
|
|
return "agent"
|
|
if batch or extra_formats:
|
|
return "standard"
|
|
if size_bytes is not None and size_bytes > AGENT_MAX_BYTES:
|
|
return "standard"
|
|
return "agent"
|
|
|
|
|
|
def _pdf_page_count_if_available(source: str) -> Optional[int]:
|
|
"""Best-effort local PDF page count.
|
|
|
|
The core stays zero-dependency, so this only runs when the optional ``pypdf``
|
|
module is installed. If it is unavailable or cannot read the PDF, callers
|
|
fall back to the API-side validation.
|
|
"""
|
|
if is_url(source) or suffix_of(source) != ".pdf":
|
|
return None
|
|
try:
|
|
import pypdf # type: ignore
|
|
except ImportError:
|
|
return None
|
|
try:
|
|
return len(pypdf.PdfReader(str(source)).pages)
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _precheck_limits(source: str, api_kind: str, opts: ParseOptions) -> Optional[str]:
|
|
"""Return a local limit error, or ``None`` when submission is allowed."""
|
|
if is_url(source):
|
|
return None
|
|
try:
|
|
size = os.path.getsize(source)
|
|
except OSError as exc:
|
|
return str(exc)
|
|
if size <= 0:
|
|
return "Empty file — upload a valid file"
|
|
if api_kind == "standard" and size > STANDARD_MAX_BYTES:
|
|
return ERROR_HINTS[-60005]
|
|
if api_kind == "agent" and size > AGENT_MAX_BYTES:
|
|
return ERROR_HINTS[-30001]
|
|
|
|
# A page range may intentionally select a capped subset, so avoid rejecting
|
|
# locally unless we know the full document must be sent.
|
|
if opts.page_ranges:
|
|
return None
|
|
pages = _pdf_page_count_if_available(source)
|
|
if pages is None:
|
|
return None
|
|
if api_kind == "standard" and pages > STANDARD_MAX_PAGES:
|
|
return ERROR_HINTS[-60006]
|
|
if api_kind == "agent" and pages > AGENT_MAX_PAGES:
|
|
return ERROR_HINTS[-30003]
|
|
return None
|
|
|
|
|
|
def _agent_page_limit_exceeded(source: str, opts: ParseOptions) -> bool:
|
|
if opts.page_ranges:
|
|
return False
|
|
pages = _pdf_page_count_if_available(source)
|
|
return pages is not None and pages > AGENT_MAX_PAGES
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# HTTP seam
|
|
# --------------------------------------------------------------------------- #
|
|
# Two layers:
|
|
# * ``_send_once`` — one keep-alive request (reuses a per-thread connection so
|
|
# the frequent poll traffic stops re-doing the TLS handshake).
|
|
# * ``_http`` — retry/backoff wrapper around ``_send_once``. This is the
|
|
# single place the unit tests monkeypatch, so the retry and
|
|
# keep-alive machinery is transparent to them; the retry path
|
|
# is exercised separately by patching ``_send_once``.
|
|
_conn_local = threading.local()
|
|
|
|
|
|
def _backoff_delay(attempt: int, retry_after=None) -> float:
|
|
"""Exponential backoff with jitter; honors a server ``Retry-After`` if given."""
|
|
if retry_after is not None:
|
|
try:
|
|
return min(float(retry_after), RETRY_MAX_DELAY)
|
|
except (TypeError, ValueError):
|
|
pass
|
|
ceiling = min(RETRY_MAX_DELAY, RETRY_BASE_DELAY * (2 ** attempt))
|
|
return ceiling * (0.5 + random.random() / 2) # jitter in [0.5x, 1.0x]
|
|
|
|
|
|
def _next_poll_interval(interval, *, progressed, base) -> float:
|
|
"""Adaptive poll backoff for ONE group: reset to ``base`` on progress, else
|
|
grow geometrically toward ``POLL_INTERVAL_CAP``. Kept per-group so a fast batch
|
|
cannot reset (or a stuck batch inflate) the polling cadence of an unrelated one."""
|
|
if progressed:
|
|
return base
|
|
return min(interval * 1.5, POLL_INTERVAL_CAP)
|
|
|
|
|
|
def _should_retry_status(status) -> bool:
|
|
return status in RETRY_STATUSES
|
|
|
|
|
|
def _is_retryable_api_error(exc: MinerUError) -> bool:
|
|
if exc.code in FATAL_API_CODES:
|
|
return False
|
|
return exc.code == 429 or exc.code in RETRYABLE_API_CODES
|
|
|
|
|
|
def _conn_pool() -> dict:
|
|
pool = getattr(_conn_local, "pool", None)
|
|
if pool is None:
|
|
pool = {}
|
|
_conn_local.pool = pool
|
|
return pool
|
|
|
|
|
|
def _drop_conn(key) -> None:
|
|
conn = _conn_pool().pop(key, None)
|
|
if conn is not None:
|
|
try:
|
|
conn.close()
|
|
except OSError:
|
|
pass
|
|
|
|
|
|
def _get_conn(scheme, host, port, timeout):
|
|
key = (scheme, host, port)
|
|
pool = _conn_pool()
|
|
conn = pool.get(key)
|
|
if conn is None:
|
|
if scheme == "https":
|
|
conn = http.client.HTTPSConnection(host, port or 443, timeout=timeout)
|
|
else:
|
|
conn = http.client.HTTPConnection(host, port or 80, timeout=timeout)
|
|
pool[key] = conn
|
|
else:
|
|
conn.timeout = timeout
|
|
return conn, key
|
|
|
|
|
|
def _content_length(data):
|
|
try:
|
|
return len(data)
|
|
except (TypeError, AttributeError):
|
|
return None
|
|
|
|
|
|
def _send_once(method, url, *, headers=None, data=None, timeout=60, _redirects=5):
|
|
"""One HTTP request over a reused keep-alive connection.
|
|
|
|
Returns ``(status_code, body_bytes, retry_after)``. Raises ``urllib.error.URLError``
|
|
on a network-level failure (so the retry layer can back off).
|
|
"""
|
|
parts = urllib.parse.urlsplit(url)
|
|
scheme, host, port = parts.scheme, parts.hostname, parts.port
|
|
path = urllib.parse.urlunsplit(("", "", parts.path or "/", parts.query, "")) or "/"
|
|
send_headers = dict(headers or {})
|
|
send_headers.setdefault("User-Agent", USER_AGENT)
|
|
if data is not None and "Content-Length" not in send_headers:
|
|
length = _content_length(data)
|
|
if length is not None:
|
|
send_headers["Content-Length"] = str(length)
|
|
|
|
# Try once on the pooled (possibly stale) connection; on a connection-level
|
|
# error reconnect once before surfacing it to the retry layer.
|
|
for stale_attempt in (0, 1):
|
|
conn, key = _get_conn(scheme, host, port, timeout)
|
|
try:
|
|
conn.request(method, path, body=data, headers=send_headers)
|
|
resp = conn.getresponse()
|
|
status = resp.status
|
|
body = resp.read()
|
|
if resp.getheader("Connection", "").lower() == "close" or resp.version == 10:
|
|
_drop_conn(key)
|
|
if status in (301, 302, 303, 307, 308) and _redirects > 0:
|
|
location = resp.getheader("Location")
|
|
if location:
|
|
nxt = urllib.parse.urljoin(url, location)
|
|
nmethod = "GET" if status in (301, 302, 303) and method != "HEAD" else method
|
|
ndata = None if nmethod != method else data
|
|
if ndata is not None and hasattr(ndata, "seek"):
|
|
try:
|
|
ndata.seek(0)
|
|
except OSError:
|
|
pass
|
|
return _send_once(nmethod, nxt, headers=headers, data=ndata,
|
|
timeout=timeout, _redirects=_redirects - 1)
|
|
return status, body, resp.getheader("Retry-After")
|
|
except (http.client.HTTPException, ConnectionError, OSError) as exc:
|
|
_drop_conn(key)
|
|
if stale_attempt == 0:
|
|
if hasattr(data, "seek"):
|
|
try:
|
|
data.seek(0)
|
|
except OSError:
|
|
pass
|
|
continue # pooled connection was stale — reconnect and retry once
|
|
raise urllib.error.URLError(exc)
|
|
raise urllib.error.URLError("connection failed") # pragma: no cover - defensive
|
|
|
|
|
|
def _http(method, url, *, headers=None, data=None, timeout=60):
|
|
"""Perform one HTTP request with bounded exponential backoff on transient
|
|
failures (429/5xx/network). Returns ``(status_code, body_bytes)``.
|
|
|
|
Business errors (HTTP 200 with ``code != 0``) are *not* retried here — that is
|
|
the caller's concern in :func:`_api_json`.
|
|
"""
|
|
for attempt in range(RETRY_MAX_ATTEMPTS):
|
|
if hasattr(data, "seek"):
|
|
try:
|
|
data.seek(0)
|
|
except OSError:
|
|
pass
|
|
status = None
|
|
retry_after = None
|
|
last_exc = None
|
|
try:
|
|
status, body, retry_after = _send_once(
|
|
method, url, headers=headers, data=data, timeout=timeout
|
|
)
|
|
except urllib.error.URLError as exc:
|
|
last_exc = exc
|
|
if status is not None and not _should_retry_status(status):
|
|
return status, body
|
|
if attempt + 1 < RETRY_MAX_ATTEMPTS:
|
|
time.sleep(_backoff_delay(attempt, retry_after if status is not None else None))
|
|
continue
|
|
# Retries exhausted.
|
|
if status == 429:
|
|
raise MinerUError(
|
|
"Rate limited (HTTP 429) — slow down, lower --workers, or set a token",
|
|
code=429,
|
|
)
|
|
if status is not None:
|
|
return status, body # surface the last 5xx body so _api_json can report it
|
|
raise MinerUError(f"Network error after {RETRY_MAX_ATTEMPTS} attempts: {last_exc}")
|
|
|
|
|
|
def _api_json(method, url, *, token=None, payload=None, timeout=60) -> dict:
|
|
"""Call a MinerU JSON endpoint and return ``data``, raising on ``code != 0``."""
|
|
headers = {"Accept": "*/*"}
|
|
body = None
|
|
if payload is not None:
|
|
headers["Content-Type"] = "application/json"
|
|
body = json.dumps(payload).encode("utf-8")
|
|
if token:
|
|
headers["Authorization"] = f"Bearer {token}"
|
|
for attempt in range(RETRY_MAX_ATTEMPTS):
|
|
status, raw = _http(method, url, headers=headers, data=body, timeout=timeout)
|
|
try:
|
|
parsed = json.loads(raw.decode("utf-8"))
|
|
except (ValueError, UnicodeDecodeError):
|
|
raise MinerUError(f"Non-JSON response (HTTP {status}) from {url}")
|
|
# MinerU returns two envelopes: the business layer uses {code, data, msg}
|
|
# while the auth/gateway layer uses {success, msgCode, msg} (e.g. on a bad
|
|
# token). Handle both so credential errors surface clearly.
|
|
if parsed.get("success") is False:
|
|
code = parsed.get("msgCode") or parsed.get("code")
|
|
hint = ERROR_HINTS.get(code) or parsed.get("msg") or error_hint(code)
|
|
raise MinerUError(hint, code=code)
|
|
code = parsed.get("code")
|
|
if not (200 <= status < 300) and code in (0, None):
|
|
raise MinerUError(f"HTTP {status} from {url}", code=status)
|
|
if code not in (0, None):
|
|
if code in RETRYABLE_API_CODES and attempt + 1 < RETRY_MAX_ATTEMPTS:
|
|
time.sleep(_backoff_delay(attempt))
|
|
continue
|
|
raise MinerUError(error_hint(code), code=code)
|
|
data = parsed.get("data")
|
|
if data is None and code is None and "success" not in parsed:
|
|
raise MinerUError(f"Unexpected response (HTTP {status}) from {url}")
|
|
return data or {}
|
|
raise MinerUError(f"API retry exhausted for {url}") # pragma: no cover - defensive
|
|
|
|
|
|
def _put_file(upload_url: str, path: str, timeout=300) -> None:
|
|
"""Upload a local file to a signed OSS URL (no Content-Type per docs)."""
|
|
headers = {"Content-Length": str(os.path.getsize(path))}
|
|
with open(path, "rb") as handle:
|
|
status, _ = _http("PUT", upload_url, headers=headers, data=handle, timeout=timeout)
|
|
if status not in (200, 201, 203):
|
|
raise MinerUError(f"Upload failed (HTTP {status})")
|
|
|
|
|
|
def _download(url: str, timeout=300) -> bytes:
|
|
status, raw = _http("GET", url, timeout=timeout)
|
|
if status != 200:
|
|
raise MinerUError(f"Download failed (HTTP {status})")
|
|
return raw
|
|
|
|
|
|
def _download_to_path(url: str, dest: Path, *, timeout=300) -> Path:
|
|
"""Stream a (potentially large) download straight to disk in chunks.
|
|
|
|
Used for result zips so a batch worker never buffers the whole archive in RAM.
|
|
Retries transient failures with backoff, mirroring :func:`_http`.
|
|
"""
|
|
for attempt in range(RETRY_MAX_ATTEMPTS):
|
|
try:
|
|
req = urllib.request.Request(url, method="GET", headers={"User-Agent": USER_AGENT})
|
|
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
|
if resp.getcode() != 200:
|
|
raise MinerUError(f"Download failed (HTTP {resp.getcode()})")
|
|
with open(dest, "wb") as handle:
|
|
while True:
|
|
chunk = resp.read(65536)
|
|
if not chunk:
|
|
break
|
|
handle.write(chunk)
|
|
return dest
|
|
except urllib.error.HTTPError as exc:
|
|
if _should_retry_status(exc.code) and attempt + 1 < RETRY_MAX_ATTEMPTS:
|
|
time.sleep(_backoff_delay(attempt, exc.headers.get("Retry-After")))
|
|
continue
|
|
raise MinerUError(f"Download failed (HTTP {exc.code})", code=exc.code)
|
|
except urllib.error.URLError as exc:
|
|
if attempt + 1 < RETRY_MAX_ATTEMPTS:
|
|
time.sleep(_backoff_delay(attempt))
|
|
continue
|
|
raise MinerUError(f"Download failed: {exc}")
|
|
raise MinerUError("Download failed after retries") # pragma: no cover - defensive
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Agent API (lightweight, no token)
|
|
# --------------------------------------------------------------------------- #
|
|
def _agent_payload(opts: ParseOptions) -> dict:
|
|
payload = {
|
|
"language": opts.language,
|
|
"enable_table": opts.enable_table,
|
|
"is_ocr": opts.is_ocr,
|
|
"enable_formula": opts.enable_formula,
|
|
}
|
|
page_range = to_agent_page_range(opts.page_ranges)
|
|
if page_range:
|
|
payload["page_range"] = page_range
|
|
return payload
|
|
|
|
|
|
def agent_parse(source: str, opts: ParseOptions, *, poll_interval=3, timeout=600):
|
|
"""Parse one URL or file via the Agent API. Returns the Markdown text."""
|
|
if is_url(source):
|
|
payload = {"url": source, **_agent_payload(opts)}
|
|
data = _api_json("POST", f"{AGENT_API}/parse/url", payload=payload)
|
|
task_id = data["task_id"]
|
|
else:
|
|
payload = {"file_name": Path(source).name, **_agent_payload(opts)}
|
|
data = _api_json("POST", f"{AGENT_API}/parse/file", payload=payload)
|
|
task_id = data["task_id"]
|
|
_put_file(data["file_url"], source, timeout=timeout)
|
|
markdown = _agent_poll(task_id, poll_interval=poll_interval, timeout=timeout)
|
|
return markdown, task_id
|
|
|
|
|
|
def _agent_poll(task_id, *, poll_interval, timeout) -> str:
|
|
deadline = time.monotonic() + timeout
|
|
while time.monotonic() < deadline:
|
|
data = _api_json("GET", f"{AGENT_API}/parse/{task_id}", timeout=timeout)
|
|
state = data.get("state")
|
|
if state == STATE_DONE:
|
|
return _download(data["markdown_url"], timeout=timeout).decode("utf-8", errors="replace")
|
|
if state == STATE_FAILED:
|
|
raise MinerUError(
|
|
data.get("err_msg") or error_hint(data.get("err_code")),
|
|
code=data.get("err_code"),
|
|
)
|
|
time.sleep(poll_interval)
|
|
raise MinerUError("Agent parse timed out")
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Standard API (v4, token required)
|
|
# --------------------------------------------------------------------------- #
|
|
def _standard_model(opts: ParseOptions, source: str) -> str:
|
|
return "MinerU-HTML" if is_html(source) else opts.model
|
|
|
|
|
|
def _standard_submit_and_poll(source, opts, token, *, poll_interval=3, timeout=600):
|
|
"""Submit one URL/file to the Standard API and poll to completion.
|
|
|
|
Returns ``(full_zip_url, task_or_batch_id)`` — the caller decides whether to
|
|
buffer (small) or stream (large) the result zip. The submit POST uses a capped
|
|
socket timeout; the full ``timeout`` stays the parse budget for polling.
|
|
"""
|
|
model = _standard_model(opts, source)
|
|
req_timeout = min(timeout, REQUEST_TIMEOUT_CAP)
|
|
if is_url(source):
|
|
payload = {
|
|
"url": source,
|
|
"model_version": model,
|
|
"is_ocr": opts.is_ocr,
|
|
"enable_formula": opts.enable_formula,
|
|
"enable_table": opts.enable_table,
|
|
"language": opts.language,
|
|
}
|
|
if opts.page_ranges:
|
|
payload["page_ranges"] = opts.page_ranges
|
|
if opts.extra_formats:
|
|
payload["extra_formats"] = list(opts.extra_formats)
|
|
data = _api_json("POST", f"{STANDARD_API}/extract/task", token=token,
|
|
payload=payload, timeout=req_timeout)
|
|
zip_url = _standard_poll_task(data["task_id"], token, poll_interval=poll_interval, timeout=timeout)
|
|
return zip_url, data["task_id"]
|
|
|
|
# Local file: request a signed upload URL, PUT the bytes, then poll the batch.
|
|
file_entry = {"name": Path(source).name, "data_id": safe_data_id(safe_stem(source))}
|
|
if opts.is_ocr:
|
|
file_entry["is_ocr"] = True
|
|
if opts.page_ranges:
|
|
file_entry["page_ranges"] = opts.page_ranges
|
|
payload = {
|
|
"files": [file_entry],
|
|
"model_version": model,
|
|
"enable_formula": opts.enable_formula,
|
|
"enable_table": opts.enable_table,
|
|
"language": opts.language,
|
|
}
|
|
if opts.extra_formats:
|
|
payload["extra_formats"] = list(opts.extra_formats)
|
|
data = _api_json("POST", f"{STANDARD_API}/file-urls/batch", token=token,
|
|
payload=payload, timeout=req_timeout)
|
|
batch_id = data["batch_id"]
|
|
_put_file(data["file_urls"][0], source, timeout=timeout)
|
|
zip_url = _standard_poll_batch(batch_id, token, Path(source).name, poll_interval=poll_interval, timeout=timeout)
|
|
return zip_url, batch_id
|
|
|
|
|
|
def standard_parse(
|
|
source: str, opts: ParseOptions, token: str, *, poll_interval=3, timeout=600
|
|
):
|
|
"""Parse one URL or file via the Standard API. Returns ``(zip_bytes, id)``."""
|
|
zip_url, task_id = _standard_submit_and_poll(
|
|
source, opts, token, poll_interval=poll_interval, timeout=timeout
|
|
)
|
|
return _download(zip_url, timeout=timeout), task_id
|
|
|
|
|
|
def _standard_parse_to_dir(source, opts, token, out_stem, output_dir, *, poll_interval=3, timeout=600):
|
|
"""Standard parse of a single input, streaming the result zip to disk rather
|
|
than buffering the whole archive in RAM. Returns ``(md_path, id)``."""
|
|
zip_url, task_id = _standard_submit_and_poll(
|
|
source, opts, token, poll_interval=poll_interval, timeout=timeout
|
|
)
|
|
target_dir = output_dir / out_stem
|
|
target_dir.mkdir(parents=True, exist_ok=True)
|
|
zip_path = target_dir / "._result.zip.partial"
|
|
_download_to_path(zip_url, zip_path, timeout=timeout)
|
|
md_path = extract_zip_path(out_stem, zip_path, output_dir)
|
|
try:
|
|
zip_path.unlink()
|
|
except OSError:
|
|
pass
|
|
return md_path, task_id
|
|
|
|
|
|
def _standard_poll_task(task_id, token, *, poll_interval, timeout) -> str:
|
|
deadline = time.monotonic() + timeout
|
|
req_timeout = min(timeout, REQUEST_TIMEOUT_CAP)
|
|
while time.monotonic() < deadline:
|
|
data = _api_json("GET", f"{STANDARD_API}/extract/task/{task_id}", token=token,
|
|
timeout=req_timeout)
|
|
state = data.get("state")
|
|
if state == STATE_DONE:
|
|
return data["full_zip_url"]
|
|
if state == STATE_FAILED:
|
|
raise MinerUError(_result_error(data), code=data.get("err_code"))
|
|
time.sleep(poll_interval)
|
|
raise MinerUError("Standard parse timed out")
|
|
|
|
|
|
def _standard_poll_batch(batch_id, token, file_name, *, poll_interval, timeout) -> str:
|
|
deadline = time.monotonic() + timeout
|
|
req_timeout = min(timeout, REQUEST_TIMEOUT_CAP)
|
|
while time.monotonic() < deadline:
|
|
data = _api_json("GET", f"{STANDARD_API}/extract-results/batch/{batch_id}",
|
|
token=token, timeout=req_timeout)
|
|
for entry in data.get("extract_result", []):
|
|
if entry.get("file_name") != file_name:
|
|
continue
|
|
state = entry.get("state")
|
|
if state == STATE_DONE:
|
|
return entry["full_zip_url"]
|
|
if state == STATE_FAILED:
|
|
raise MinerUError(_result_error(entry), code=entry.get("err_code"))
|
|
time.sleep(poll_interval)
|
|
raise MinerUError("Standard parse timed out")
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Output writing
|
|
# --------------------------------------------------------------------------- #
|
|
def write_markdown(stem: str, markdown: str, output_dir: Path) -> Path:
|
|
"""Write a bare Markdown string (Agent API result) to ``<dir>/<stem>/<stem>.md``.
|
|
|
|
The write is atomic (temp file + ``os.replace``) so an interrupted run can never
|
|
leave a half-written ``.md`` that ``--resume`` would mistake for a finished file.
|
|
"""
|
|
target_dir = output_dir / stem
|
|
target_dir.mkdir(parents=True, exist_ok=True)
|
|
md_path = target_dir / f"{stem}.md"
|
|
tmp_path = target_dir / f".{stem}.md.partial"
|
|
tmp_path.write_text(markdown, encoding="utf-8")
|
|
os.replace(tmp_path, md_path)
|
|
return md_path
|
|
|
|
|
|
def _finalize_zip_dir(target_dir: Path, stem: str) -> Path:
|
|
"""Rename the archive's ``full.md`` to ``<stem>.md``.
|
|
|
|
With no ``full.md`` (and no already-correct ``<stem>.md``), pick the largest
|
|
``*.md`` — the most likely full body — tie-broken by name, so the choice is
|
|
deterministic instead of filesystem ``glob`` order.
|
|
"""
|
|
full_md = target_dir / "full.md"
|
|
md_path = target_dir / f"{stem}.md"
|
|
if full_md.exists():
|
|
full_md.replace(md_path)
|
|
elif not md_path.exists():
|
|
candidates = sorted(target_dir.glob("*.md"), key=lambda p: (-p.stat().st_size, p.name))
|
|
if candidates:
|
|
candidates[0].replace(md_path)
|
|
return md_path
|
|
|
|
|
|
def _validate_zip_member(info: zipfile.ZipInfo) -> None:
|
|
name = info.filename
|
|
parts = PurePosixPath(name).parts
|
|
if PurePosixPath(name).is_absolute() or ".." in parts:
|
|
raise MinerUError(f"Unsafe zip member path: {name}")
|
|
file_type = (info.external_attr >> 16) & 0o170000
|
|
if file_type == 0o120000:
|
|
raise MinerUError(f"Unsafe zip symlink: {name}")
|
|
|
|
|
|
def _safe_extract_zip(archive: zipfile.ZipFile, target_dir: Path) -> None:
|
|
for info in archive.infolist():
|
|
_validate_zip_member(info)
|
|
archive.extractall(target_dir)
|
|
|
|
|
|
def write_zip(stem: str, zip_bytes: bytes, output_dir: Path) -> Path:
|
|
"""Extract a Standard API result zip and return the path to the renamed Markdown."""
|
|
target_dir = output_dir / stem
|
|
target_dir.mkdir(parents=True, exist_ok=True)
|
|
with zipfile.ZipFile(BytesIO(zip_bytes)) as archive:
|
|
_safe_extract_zip(archive, target_dir)
|
|
return _finalize_zip_dir(target_dir, stem)
|
|
|
|
|
|
def extract_zip_path(stem: str, zip_path: Path, output_dir: Path) -> Path:
|
|
"""Extract a result zip already on disk (streamed download) without buffering it."""
|
|
target_dir = output_dir / stem
|
|
target_dir.mkdir(parents=True, exist_ok=True)
|
|
with zipfile.ZipFile(zip_path) as archive:
|
|
_safe_extract_zip(archive, target_dir)
|
|
return _finalize_zip_dir(target_dir, stem)
|
|
|
|
|
|
def copy_to_obsidian(md_path: Path, stem: str, vault: Path) -> Path:
|
|
"""Copy the parsed Markdown (and sibling images) into an Obsidian vault folder."""
|
|
vault.mkdir(parents=True, exist_ok=True)
|
|
dest = vault / f"{stem}.md"
|
|
dest.write_text(md_path.read_text(encoding="utf-8"), encoding="utf-8")
|
|
images = md_path.parent / "images"
|
|
if images.is_dir():
|
|
dest_images = vault / "images"
|
|
dest_images.mkdir(exist_ok=True)
|
|
for img in images.iterdir():
|
|
if img.is_file():
|
|
(dest_images / img.name).write_bytes(img.read_bytes())
|
|
return dest
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Per-input orchestration
|
|
# --------------------------------------------------------------------------- #
|
|
def process_one(
|
|
source: str,
|
|
opts: ParseOptions,
|
|
*,
|
|
token: Optional[str],
|
|
output_dir: Path,
|
|
api: str = "auto",
|
|
obsidian: Optional[Path] = None,
|
|
resume: bool = False,
|
|
poll_interval: float = 3,
|
|
timeout: float = 600,
|
|
engine: str = "cloud",
|
|
out_stem: Optional[str] = None,
|
|
) -> ParseResult:
|
|
"""Parse a single input end to end, choosing the backend and writing output."""
|
|
stem = safe_stem(source)
|
|
out_stem = out_stem or stem # on-disk identity (disambiguated by the caller on collision)
|
|
result = ParseResult(name=stem, source=source, modality=detect_modality(source))
|
|
started = time.monotonic()
|
|
|
|
if resume and (output_dir / out_stem / f"{out_stem}.md").exists():
|
|
result.state = "skipped"
|
|
result.output_dir = str(output_dir / out_stem)
|
|
result.markdown_path = str(output_dir / out_stem / f"{out_stem}.md")
|
|
return result
|
|
|
|
if engine in ("local", "auto") and not is_url(source) and suffix_of(source) == ".pdf":
|
|
local = _load_local_engine()
|
|
use_local = engine == "local"
|
|
if engine == "auto" and local is not None:
|
|
try:
|
|
use_local = local.available() and local.is_born_digital(source)
|
|
except Exception:
|
|
use_local = False
|
|
if use_local and local is not None:
|
|
try:
|
|
markdown = local.parse_local(source)
|
|
md_path = write_markdown(out_stem, markdown, output_dir)
|
|
result.api = "local"
|
|
result.markdown = markdown
|
|
return _finalize(result, out_stem, output_dir, md_path, obsidian, started)
|
|
except Exception as exc: # LocalEngineError or parse failure
|
|
if engine == "local":
|
|
result.state = STATE_FAILED
|
|
result.error = str(exc)
|
|
return result
|
|
# auto: fall through to the cloud engine
|
|
|
|
size_bytes = None
|
|
if not is_url(source):
|
|
try:
|
|
size_bytes = os.path.getsize(source)
|
|
except OSError:
|
|
pass
|
|
|
|
chosen = choose_api(
|
|
token=token,
|
|
source=source,
|
|
size_bytes=size_bytes,
|
|
batch=False,
|
|
extra_formats=opts.extra_formats,
|
|
explicit=api,
|
|
)
|
|
if chosen == "agent" and api == "auto" and token and _agent_page_limit_exceeded(source, opts):
|
|
chosen = "standard"
|
|
|
|
precheck_error = _precheck_limits(source, chosen, opts)
|
|
if precheck_error:
|
|
result.api = chosen
|
|
result.state = STATE_FAILED
|
|
result.error = precheck_error
|
|
return result
|
|
|
|
try:
|
|
result.api = chosen
|
|
if chosen == "standard":
|
|
if not token:
|
|
raise MinerUError(
|
|
"Standard API needs a token — set MINERU_TOKEN "
|
|
"(https://mineru.net/apiManage/token)"
|
|
)
|
|
md_path, task_id = _standard_parse_to_dir(
|
|
source, opts, token, out_stem, output_dir,
|
|
poll_interval=poll_interval, timeout=timeout
|
|
)
|
|
result.task_id = task_id
|
|
result.markdown = md_path.read_text(encoding="utf-8") if md_path.exists() else ""
|
|
else:
|
|
try:
|
|
markdown, task_id = agent_parse(
|
|
source, opts, poll_interval=poll_interval, timeout=timeout
|
|
)
|
|
except MinerUError as exc:
|
|
# Auto-escalate to the Standard API when a token is available.
|
|
if api == "auto" and token and exc.code in AGENT_ESCALATABLE:
|
|
result.api = "standard"
|
|
md_path, task_id = _standard_parse_to_dir(
|
|
source, opts, token, out_stem, output_dir,
|
|
poll_interval=poll_interval, timeout=timeout
|
|
)
|
|
result.task_id = task_id
|
|
result.markdown = md_path.read_text(encoding="utf-8") if md_path.exists() else ""
|
|
return _finalize(result, out_stem, output_dir, md_path, obsidian, started)
|
|
raise
|
|
result.task_id = task_id
|
|
md_path = write_markdown(out_stem, markdown, output_dir)
|
|
result.markdown = markdown
|
|
return _finalize(result, out_stem, output_dir, md_path, obsidian, started)
|
|
except MinerUError as exc:
|
|
result.state = STATE_FAILED
|
|
result.error = str(exc)
|
|
return result
|
|
except (OSError, urllib.error.URLError) as exc:
|
|
result.state = STATE_FAILED
|
|
result.error = str(exc)
|
|
return result
|
|
except Exception as exc: # noqa: BLE001 - one malformed response must not abort the batch
|
|
result.state = STATE_FAILED
|
|
result.error = f"{type(exc).__name__}: {exc}"
|
|
return result
|
|
|
|
|
|
def _finalize(result, out_stem, output_dir, md_path, obsidian, started=None) -> ParseResult:
|
|
# A result that yielded no markdown on disk is a failure, not a silent "done"
|
|
# pointing at a nonexistent/empty body.
|
|
if md_path is None or not Path(md_path).exists():
|
|
result.state = STATE_FAILED
|
|
result.error = result.error or "no markdown in result"
|
|
return result
|
|
result.state = STATE_DONE
|
|
result.output_dir = str(output_dir / out_stem)
|
|
result.markdown_path = str(md_path)
|
|
if started is not None:
|
|
result.elapsed = round(time.monotonic() - started, 2)
|
|
if obsidian is not None:
|
|
copy_to_obsidian(md_path, out_stem, obsidian)
|
|
return result
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Multi-input pipeline: decoupled submit -> poll -> download
|
|
# --------------------------------------------------------------------------- #
|
|
# The single-file primitives above each block a thread for the whole parse
|
|
# (submit, then ``time.sleep`` poll loop). That caps real concurrency at the
|
|
# worker count and wastes threads sleeping. For batches we instead:
|
|
# 1. submit every input up front (Standard local files coalesced into one
|
|
# ``/file-urls/batch`` call of up to 50 files — the endpoint's real purpose),
|
|
# 2. poll all outstanding tickets from one place with adaptive backoff (a single
|
|
# batch poll returns the state of every file in that batch), and
|
|
# 3. download + extract completed results in a small pool as soon as each is done.
|
|
# Nothing sleeps holding a parse slot, and Standard API/poll traffic collapses ~Nx.
|
|
@dataclass
|
|
class _Job:
|
|
source: str
|
|
stem: str
|
|
api: str
|
|
is_url: bool
|
|
result: ParseResult
|
|
out_stem: str = ""
|
|
data_id: str = ""
|
|
file_name: str = ""
|
|
poll_kind: str = "" # "agent" | "task" | "batch"
|
|
poll_id: str = ""
|
|
download_url: str = ""
|
|
download_kind: str = "" # "md" | "zip"
|
|
deadline: float = 0.0
|
|
started: float = 0.0
|
|
finished: bool = False # resolved during planning/submit (skip or hard-fail)
|
|
|
|
|
|
def _plan_jobs(sources, opts, *, token, output_dir, api, resume, batch_mode=False) -> list:
|
|
"""Resolve each source's backend and short-circuit already-parsed inputs."""
|
|
jobs = []
|
|
out_stems = unique_out_stems(sources)
|
|
for idx, src in enumerate(sources):
|
|
stem = safe_stem(src)
|
|
out_stem = out_stems[idx]
|
|
res = ParseResult(name=stem, source=src, modality=detect_modality(src))
|
|
url = is_url(src)
|
|
job = _Job(
|
|
source=src, stem=stem, out_stem=out_stem, api="agent", is_url=url, result=res,
|
|
file_name=(safe_stem(src) if url else Path(src).name),
|
|
data_id=f"{safe_data_id(stem)}-{idx}",
|
|
)
|
|
if resume and (output_dir / out_stem / f"{out_stem}.md").exists():
|
|
res.state = "skipped"
|
|
res.output_dir = str(output_dir / out_stem)
|
|
res.markdown_path = str(output_dir / out_stem / f"{out_stem}.md")
|
|
job.finished = True
|
|
jobs.append(job)
|
|
continue
|
|
size_bytes = None
|
|
if not url:
|
|
try:
|
|
size_bytes = os.path.getsize(src)
|
|
except OSError:
|
|
pass
|
|
job.api = choose_api(
|
|
token=token, source=src, size_bytes=size_bytes, batch=batch_mode,
|
|
extra_formats=opts.extra_formats, explicit=api,
|
|
)
|
|
if job.api == "agent" and api == "auto" and token and _agent_page_limit_exceeded(src, opts):
|
|
job.api = "standard"
|
|
res.api = job.api
|
|
precheck_error = _precheck_limits(src, job.api, opts)
|
|
if precheck_error:
|
|
res.state = STATE_FAILED
|
|
res.error = precheck_error
|
|
job.finished = True
|
|
jobs.append(job)
|
|
return jobs
|
|
|
|
|
|
def _reserve_single(job, opts, token, timeout) -> list:
|
|
"""Submit one non-batch input; return upload work ``[(job, url)]`` (empty for URLs)."""
|
|
src = job.source
|
|
job.deadline = time.monotonic() + timeout
|
|
req_timeout = min(timeout, REQUEST_TIMEOUT_CAP)
|
|
if job.api == "agent":
|
|
if job.is_url:
|
|
data = _api_json("POST", f"{AGENT_API}/parse/url",
|
|
payload={"url": src, **_agent_payload(opts)}, timeout=req_timeout)
|
|
job.poll_kind, job.poll_id = "agent", data["task_id"]
|
|
job.result.task_id = data["task_id"]
|
|
return []
|
|
data = _api_json("POST", f"{AGENT_API}/parse/file",
|
|
payload={"file_name": Path(src).name, **_agent_payload(opts)},
|
|
timeout=req_timeout)
|
|
job.poll_kind, job.poll_id = "agent", data["task_id"]
|
|
job.result.task_id = data["task_id"]
|
|
return [(job, data["file_url"])]
|
|
# Standard API URL -> single extract task (only local files coalesce into batches).
|
|
model = _standard_model(opts, src)
|
|
payload = {
|
|
"url": src, "model_version": model, "is_ocr": opts.is_ocr,
|
|
"enable_formula": opts.enable_formula, "enable_table": opts.enable_table,
|
|
"language": opts.language,
|
|
}
|
|
if opts.page_ranges:
|
|
payload["page_ranges"] = opts.page_ranges
|
|
if opts.extra_formats:
|
|
payload["extra_formats"] = list(opts.extra_formats)
|
|
data = _api_json("POST", f"{STANDARD_API}/extract/task", token=token,
|
|
payload=payload, timeout=req_timeout)
|
|
job.poll_kind, job.poll_id = "task", data["task_id"]
|
|
job.result.task_id = data["task_id"]
|
|
return []
|
|
|
|
|
|
def _reserve_batch(batch_jobs, opts, token, timeout) -> list:
|
|
"""Submit one ``/file-urls/batch`` of up to 50 files; return upload work list."""
|
|
model = _standard_model(opts, batch_jobs[0].source)
|
|
files = []
|
|
for job in batch_jobs:
|
|
entry = {"name": Path(job.source).name, "data_id": job.data_id}
|
|
if opts.is_ocr:
|
|
entry["is_ocr"] = True
|
|
if opts.page_ranges:
|
|
entry["page_ranges"] = opts.page_ranges
|
|
files.append(entry)
|
|
payload = {
|
|
"files": files, "model_version": model,
|
|
"enable_formula": opts.enable_formula, "enable_table": opts.enable_table,
|
|
"language": opts.language,
|
|
}
|
|
if opts.extra_formats:
|
|
payload["extra_formats"] = list(opts.extra_formats)
|
|
data = _api_json("POST", f"{STANDARD_API}/file-urls/batch", token=token,
|
|
payload=payload, timeout=min(timeout, REQUEST_TIMEOUT_CAP))
|
|
batch_id = data["batch_id"]
|
|
urls = data["file_urls"]
|
|
if len(urls) != len(batch_jobs):
|
|
raise MinerUError(
|
|
f"Batch upload URL count mismatch: requested {len(batch_jobs)}, got {len(urls)}"
|
|
)
|
|
deadline = time.monotonic() + timeout
|
|
uploads = []
|
|
for job, url in zip(batch_jobs, urls):
|
|
if isinstance(url, dict):
|
|
url = url.get("url") or url.get("file_url") or url.get("upload_url")
|
|
if not url:
|
|
raise MinerUError(f"Missing upload URL for {job.file_name or job.source}")
|
|
job.poll_kind, job.poll_id = "batch", batch_id
|
|
job.result.task_id = batch_id
|
|
job.deadline = deadline
|
|
uploads.append((job, url))
|
|
return uploads
|
|
|
|
|
|
def _reserve_url_batch(batch_jobs, opts, token, timeout) -> list:
|
|
"""Submit one Standard ``/extract/task/batch`` for URL inputs."""
|
|
model = _standard_model(opts, batch_jobs[0].source)
|
|
files = [{"url": job.source, "data_id": job.data_id} for job in batch_jobs]
|
|
payload = {"files": files, "model_version": model}
|
|
data = _api_json("POST", f"{STANDARD_API}/extract/task/batch", token=token,
|
|
payload=payload, timeout=min(timeout, REQUEST_TIMEOUT_CAP))
|
|
batch_id = data["batch_id"]
|
|
deadline = time.monotonic() + timeout
|
|
for job in batch_jobs:
|
|
job.poll_kind, job.poll_id = "batch", batch_id
|
|
job.result.task_id = batch_id
|
|
job.deadline = deadline
|
|
return []
|
|
|
|
|
|
def _can_batch_standard_urls(opts: ParseOptions) -> bool:
|
|
"""URL batch endpoint has a narrower documented payload than single URL tasks."""
|
|
return (
|
|
not opts.is_ocr
|
|
and opts.enable_formula
|
|
and opts.enable_table
|
|
and opts.language == "ch"
|
|
and not opts.page_ranges
|
|
and not opts.extra_formats
|
|
)
|
|
|
|
|
|
def _chunk_standard_jobs(std_jobs, opts, batch_size) -> list:
|
|
"""Group Standard-API jobs into batches, splitting by model_version."""
|
|
by_model: dict = {}
|
|
for job in std_jobs:
|
|
by_model.setdefault(_standard_model(opts, job.source), []).append(job)
|
|
chunks = []
|
|
size = max(1, min(batch_size, BATCH_MAX_FILES))
|
|
for group in by_model.values():
|
|
for i in range(0, len(group), size):
|
|
chunks.append(group[i:i + size])
|
|
return chunks
|
|
|
|
|
|
def _chunk_standard_files(std_jobs, opts, batch_size) -> list:
|
|
"""Backward-compatible wrapper for tests/importers."""
|
|
return _chunk_standard_jobs(std_jobs, opts, batch_size)
|
|
|
|
|
|
def _poll_group(kind, poll_id, group, token, *, timeout=60):
|
|
"""Poll one ticket; return ``(completed_jobs, failed_jobs)`` (state set on each)."""
|
|
completed, failed = [], []
|
|
if kind == "agent":
|
|
data = _api_json("GET", f"{AGENT_API}/parse/{poll_id}", timeout=timeout)
|
|
job = group[0]
|
|
state = data.get("state")
|
|
if state == STATE_DONE:
|
|
job.download_url, job.download_kind = data["markdown_url"], "md"
|
|
completed.append(job)
|
|
elif state == STATE_FAILED:
|
|
job.result.error = _result_error(data)
|
|
failed.append(job)
|
|
elif kind == "task":
|
|
data = _api_json("GET", f"{STANDARD_API}/extract/task/{poll_id}", token=token,
|
|
timeout=timeout)
|
|
job = group[0]
|
|
state = data.get("state")
|
|
if state == STATE_DONE:
|
|
job.download_url, job.download_kind = data["full_zip_url"], "zip"
|
|
completed.append(job)
|
|
elif state == STATE_FAILED:
|
|
job.result.error = _result_error(data)
|
|
failed.append(job)
|
|
else: # batch — one GET reports every file in the batch
|
|
data = _api_json("GET", f"{STANDARD_API}/extract-results/batch/{poll_id}",
|
|
token=token, timeout=timeout)
|
|
by_data, by_name = {}, {}
|
|
for entry in data.get("extract_result", []):
|
|
if entry.get("data_id"):
|
|
by_data[entry["data_id"]] = entry
|
|
if entry.get("file_name"):
|
|
by_name.setdefault(entry["file_name"], entry)
|
|
group_name_counts = {}
|
|
for job in group:
|
|
group_name_counts[job.file_name] = group_name_counts.get(job.file_name, 0) + 1
|
|
for job in group:
|
|
entry = by_data.get(job.data_id)
|
|
if entry is None and group_name_counts.get(job.file_name) == 1:
|
|
entry = by_name.get(job.file_name)
|
|
if not entry:
|
|
continue
|
|
state = entry.get("state")
|
|
if state == STATE_DONE:
|
|
job.download_url, job.download_kind = entry["full_zip_url"], "zip"
|
|
completed.append(job)
|
|
elif state == STATE_FAILED:
|
|
job.result.error = _result_error(entry)
|
|
failed.append(job)
|
|
return completed, failed
|
|
|
|
|
|
def _download_and_write(job, opts, output_dir, *, obsidian, want_markdown, timeout=300):
|
|
"""Download a completed result, write it, and finalize the job's ParseResult."""
|
|
try:
|
|
if job.download_kind == "md":
|
|
markdown = _download(job.download_url, timeout=timeout).decode("utf-8", errors="replace")
|
|
md_path = write_markdown(job.out_stem, markdown, output_dir)
|
|
if want_markdown:
|
|
job.result.markdown = markdown
|
|
else:
|
|
target_dir = output_dir / job.out_stem
|
|
target_dir.mkdir(parents=True, exist_ok=True)
|
|
zip_path = target_dir / "._result.zip.partial"
|
|
_download_to_path(job.download_url, zip_path, timeout=timeout)
|
|
md_path = extract_zip_path(job.out_stem, zip_path, output_dir)
|
|
try:
|
|
zip_path.unlink()
|
|
except OSError:
|
|
pass
|
|
if want_markdown and md_path.exists():
|
|
job.result.markdown = md_path.read_text(encoding="utf-8", errors="replace")
|
|
_finalize(job.result, job.out_stem, output_dir, md_path, obsidian,
|
|
started=job.started or None)
|
|
except Exception as exc: # noqa: BLE001 - isolate a single bad result
|
|
job.result.state = STATE_FAILED
|
|
job.result.error = f"{type(exc).__name__}: {exc}"
|
|
|
|
|
|
def _poll_until_done(active, opts, token, output_dir, *, poll_interval, obsidian,
|
|
want_markdown, download_pool, on_done, timeout=60):
|
|
"""Decoupled poll loop: dispatch downloads as results complete.
|
|
|
|
Each batch/task ticket carries its OWN adaptive backoff and next-due time, so
|
|
one batch's cadence never resets or inflates another's. The per-request socket
|
|
timeout is capped (a stalled poll can't wedge the loop for the whole parse
|
|
budget — that lives in ``job.deadline``), and any unexpected error in one
|
|
ticket fails only that ticket instead of aborting the whole batch.
|
|
"""
|
|
poll_req_timeout = min(timeout, REQUEST_TIMEOUT_CAP)
|
|
now0 = time.monotonic()
|
|
groups: dict = {} # poll_id -> {kind, jobs, interval, due}
|
|
for job in active:
|
|
st = groups.setdefault(
|
|
job.poll_id, {"kind": job.poll_kind, "jobs": [], "interval": poll_interval, "due": now0}
|
|
)
|
|
st["jobs"].append(job)
|
|
dl_futures = []
|
|
while groups:
|
|
now = time.monotonic()
|
|
due_ids = [pid for pid, st in groups.items() if st["due"] <= now]
|
|
if not due_ids:
|
|
time.sleep(max(0.0, min(st["due"] for st in groups.values()) - now))
|
|
continue
|
|
for poll_id in due_ids:
|
|
st = groups[poll_id]
|
|
group = st["jobs"]
|
|
try:
|
|
completed, failed = _poll_group(st["kind"], poll_id, group, token, timeout=poll_req_timeout)
|
|
except MinerUError as exc:
|
|
completed = []
|
|
if _is_retryable_api_error(exc):
|
|
failed = [] # transient poll error — try again next cycle
|
|
else:
|
|
failed = list(group)
|
|
for job in failed:
|
|
job.result.error = str(exc)
|
|
except Exception as exc: # noqa: BLE001 - a malformed ticket must not abort the batch
|
|
completed = []
|
|
failed = list(group)
|
|
for job in failed:
|
|
job.result.error = f"poll error: {type(exc).__name__}: {exc}"
|
|
for job in group:
|
|
if job in completed or job in failed:
|
|
continue
|
|
if now > job.deadline:
|
|
job.result.state = STATE_FAILED
|
|
job.result.error = "parse timed out"
|
|
failed.append(job)
|
|
for job in completed:
|
|
group.remove(job)
|
|
dl_futures.append(
|
|
download_pool.submit(
|
|
_download_and_write, job, opts, output_dir,
|
|
obsidian=obsidian, want_markdown=want_markdown, timeout=timeout,
|
|
)
|
|
)
|
|
for job in failed:
|
|
group.remove(job)
|
|
on_done(job)
|
|
progressed = bool(completed or failed)
|
|
if not group:
|
|
del groups[poll_id]
|
|
else:
|
|
st["interval"] = _next_poll_interval(st["interval"], progressed=progressed, base=poll_interval)
|
|
st["due"] = now + st["interval"]
|
|
for future in dl_futures:
|
|
future.result() # _download_and_write swallows its own errors into the result
|
|
# Notify every resolved job after downloads finish (the guard de-dupes earlier
|
|
# poll-failure notifications); includes jobs that failed *inside* the download
|
|
# step, so none are silently left unreported.
|
|
for job in active:
|
|
on_done(job)
|
|
|
|
|
|
def run_pipeline(sources, opts, *, token, output_dir, api, resume, poll_interval,
|
|
timeout, batch_size, workers, obsidian=None, want_markdown=False,
|
|
on_result=None) -> list:
|
|
"""Parse many inputs with decoupled submit/poll/download. Returns ParseResults."""
|
|
batch_mode = len(sources) > 1
|
|
jobs = _plan_jobs(sources, opts, token=token, output_dir=output_dir,
|
|
api=api, resume=resume, batch_mode=batch_mode)
|
|
started = time.monotonic()
|
|
for job in jobs:
|
|
job.started = started
|
|
|
|
def _notify(job):
|
|
if on_result is not None and not getattr(job, "_notified", False):
|
|
job._notified = True
|
|
on_result(job.result)
|
|
|
|
for job in jobs:
|
|
if job.finished: # resume-skipped
|
|
_notify(job)
|
|
|
|
pending = [j for j in jobs if not j.finished]
|
|
std_files = [j for j in pending if j.api == "standard" and not j.is_url]
|
|
std_urls = [j for j in pending if j.api == "standard" and j.is_url]
|
|
batchable_urls = std_urls if _can_batch_standard_urls(opts) else []
|
|
single_urls = [] if _can_batch_standard_urls(opts) else std_urls
|
|
single_url_ids = {id(j) for j in single_urls}
|
|
singles = [
|
|
j for j in pending
|
|
if j.api != "standard" or (j.is_url and id(j) in single_url_ids)
|
|
]
|
|
|
|
if not token:
|
|
for job in pending:
|
|
if job.api == "standard":
|
|
job.result.state = STATE_FAILED
|
|
job.result.error = ("Standard API needs a token — set MINERU_TOKEN "
|
|
"(https://mineru.net/apiManage/token)")
|
|
job.finished = True
|
|
_notify(job)
|
|
std_files = [j for j in std_files if not j.finished]
|
|
batchable_urls = [j for j in batchable_urls if not j.finished]
|
|
single_urls = [j for j in single_urls if not j.finished]
|
|
singles = [j for j in singles if not j.finished]
|
|
|
|
file_batches = _chunk_standard_jobs(std_files, opts, batch_size)
|
|
url_batches = _chunk_standard_jobs(batchable_urls, opts, batch_size)
|
|
|
|
quota_tripped = threading.Event()
|
|
|
|
def _reserve_guarded(fn, *fn_args):
|
|
# Daily quota / retry-limit is terminal for the whole run; once tripped,
|
|
# skip the remaining submits instead of firing more doomed requests.
|
|
if quota_tripped.is_set():
|
|
raise MinerUError(
|
|
f"daily quota reached ({FREE_DAILY_PAGES} free pages/day) — submit skipped; retry tomorrow",
|
|
code=-60018,
|
|
)
|
|
try:
|
|
return fn(*fn_args)
|
|
except MinerUError as exc:
|
|
if exc.code in QUOTA_EXHAUSTED_CODES:
|
|
quota_tripped.set()
|
|
raise
|
|
|
|
with ThreadPoolExecutor(max_workers=workers) as submit_pool, \
|
|
ThreadPoolExecutor(max_workers=workers) as download_pool:
|
|
# Phase 1 — reserve (parallel): one POST per single input / per batch.
|
|
reserve = {}
|
|
for job in singles:
|
|
reserve[submit_pool.submit(_reserve_guarded, _reserve_single, job, opts, token, timeout)] = ("single", job)
|
|
for chunk in file_batches:
|
|
reserve[submit_pool.submit(_reserve_guarded, _reserve_batch, chunk, opts, token, timeout)] = ("batch", chunk)
|
|
for chunk in url_batches:
|
|
reserve[submit_pool.submit(_reserve_guarded, _reserve_url_batch, chunk, opts, token, timeout)] = ("batch", chunk)
|
|
|
|
uploads = []
|
|
for future in as_completed(reserve):
|
|
kind, payload = reserve[future]
|
|
affected = payload if kind == "batch" else [payload]
|
|
try:
|
|
uploads.extend(future.result())
|
|
except Exception as exc: # noqa: BLE001 - submit failure isolated per ticket
|
|
msg = str(exc) if isinstance(exc, MinerUError) else f"{type(exc).__name__}: {exc}"
|
|
for job in affected:
|
|
if job.result.state != STATE_DONE:
|
|
job.result.state = STATE_FAILED
|
|
job.result.error = f"submit failed: {msg}"
|
|
job.finished = True
|
|
_notify(job)
|
|
|
|
# Phase 2 — upload (parallel): PUT each reserved file to its signed URL.
|
|
up_futures = {}
|
|
for job, url in uploads:
|
|
up_futures[submit_pool.submit(_put_file, url, job.source, timeout)] = job
|
|
for future in as_completed(up_futures):
|
|
job = up_futures[future]
|
|
try:
|
|
future.result()
|
|
except Exception as exc: # noqa: BLE001
|
|
job.result.state = STATE_FAILED
|
|
job.result.error = f"upload failed: {exc}"
|
|
job.finished = True
|
|
_notify(job)
|
|
|
|
# Phase 3 — poll + download (decoupled). Gate on the ``finished`` flag, NOT
|
|
# result.state: a freshly submitted job's ParseResult still carries its default
|
|
# ``failed`` state, so a state-based filter would drop every in-flight job and
|
|
# skip polling entirely — reporting the whole batch as failed.
|
|
active = [j for j in pending if j.poll_id and not j.finished]
|
|
if active:
|
|
_poll_until_done(
|
|
active, opts, token, output_dir, poll_interval=poll_interval,
|
|
obsidian=obsidian, want_markdown=want_markdown,
|
|
download_pool=download_pool, on_done=_notify, timeout=timeout,
|
|
)
|
|
|
|
return [job.result for job in jobs]
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Input expansion + CLI
|
|
# --------------------------------------------------------------------------- #
|
|
_IMG_REF = re.compile(r"(!\[[^\]]*\]\()([^)\s]+)(\))")
|
|
|
|
|
|
def _load_splitter():
|
|
script_dir = str(Path(__file__).resolve().parent)
|
|
if script_dir not in sys.path:
|
|
sys.path.insert(0, script_dir)
|
|
import splitter
|
|
return splitter
|
|
|
|
|
|
def _load_local_engine():
|
|
"""Import the optional offline-engine module; return it or None."""
|
|
try:
|
|
script_dir = str(Path(__file__).resolve().parent)
|
|
if script_dir not in sys.path:
|
|
sys.path.insert(0, script_dir)
|
|
import local_engine
|
|
return local_engine
|
|
except Exception: # pragma: no cover
|
|
return None
|
|
|
|
|
|
def split_cap(token, api, override=None) -> int:
|
|
"""Pages per part: explicit override, else the cap of the backend that will run."""
|
|
if override:
|
|
return override
|
|
if api == "standard" or (api == "auto" and token):
|
|
return STANDARD_MAX_PAGES
|
|
return AGENT_MAX_PAGES
|
|
|
|
|
|
def _merge_parts(part_results, stem: str, final_dir: Path) -> tuple:
|
|
"""Merge part Markdown + images into ``final_dir``; return (markdown, image_count)."""
|
|
images_dir = final_dir / "images"
|
|
bodies = []
|
|
image_count = 0
|
|
for n, res in enumerate(part_results, start=1):
|
|
if res.state != STATE_DONE or not res.markdown:
|
|
continue
|
|
part_md_dir = Path(res.markdown_path).parent if res.markdown_path else None
|
|
prefix = f"part{n:03d}_"
|
|
|
|
def repl(match, _dir=part_md_dir, _pfx=prefix):
|
|
nonlocal image_count
|
|
ref = match.group(2)
|
|
if ref.startswith("http://") or ref.startswith("https://") or _dir is None:
|
|
return match.group(0)
|
|
src = (_dir / ref)
|
|
if not src.is_file():
|
|
return match.group(0)
|
|
images_dir.mkdir(parents=True, exist_ok=True)
|
|
new_name = _pfx + Path(ref).name
|
|
(images_dir / new_name).write_bytes(src.read_bytes())
|
|
image_count += 1
|
|
return f"{match.group(1)}images/{new_name}{match.group(3)}"
|
|
|
|
bodies.append(_IMG_REF.sub(repl, res.markdown))
|
|
merged = ("\n\n---\n\n").join(bodies)
|
|
final_dir.mkdir(parents=True, exist_ok=True)
|
|
(final_dir / f"{stem}.md").write_text(merged, encoding="utf-8")
|
|
return merged, image_count
|
|
|
|
|
|
def process_split(source, opts, *, token, output_dir, api, resume, timeout, cap, engine="cloud", out_stem=None):
|
|
"""Split an oversized local PDF, parse each part, and merge. Returns a ParseResult,
|
|
or None when no split is needed (caller falls back to process_one)."""
|
|
if is_url(source) or suffix_of(source) != ".pdf":
|
|
return None
|
|
stem = safe_stem(source)
|
|
out_stem = out_stem or stem
|
|
result = ParseResult(name=stem, source=source, modality="pdf")
|
|
try:
|
|
splitter = _load_splitter()
|
|
pages = splitter.pdf_page_count(source)
|
|
except Exception as exc: # SplitError (pypdf missing) or unreadable PDF
|
|
result.state = STATE_FAILED
|
|
result.error = str(exc)
|
|
return result
|
|
if pages <= cap:
|
|
return None # fits — let normal processing handle it
|
|
|
|
final_dir = output_dir / out_stem
|
|
if resume and (final_dir / f"{out_stem}.md").exists():
|
|
result.state = "skipped"
|
|
result.output_dir = str(final_dir)
|
|
result.markdown_path = str(final_dir / f"{out_stem}.md")
|
|
return result
|
|
|
|
started = time.monotonic()
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
parts = splitter.split_pdf(source, cap, tmp)
|
|
part_out = Path(tmp) / "out"
|
|
part_results = [
|
|
process_one(str(p), opts, token=token, output_dir=part_out,
|
|
api=api, resume=False, timeout=timeout, engine=engine)
|
|
for p in parts
|
|
]
|
|
failed = [r for r in part_results if r.state == STATE_FAILED]
|
|
if failed:
|
|
result.state = STATE_FAILED
|
|
result.error = f"part failed: {failed[0].error}"
|
|
return result
|
|
merged, n_images = _merge_parts(part_results, out_stem, final_dir)
|
|
|
|
result.state = STATE_DONE
|
|
result.api = part_results[0].api if part_results else api
|
|
result.output_dir = str(final_dir)
|
|
result.markdown_path = str(final_dir / f"{out_stem}.md")
|
|
result.markdown = merged
|
|
result.elapsed = round(time.monotonic() - started, 2)
|
|
result.task_id = f"split:{len(parts)}parts"
|
|
return result
|
|
|
|
|
|
def expand_inputs(raw_inputs) -> list:
|
|
"""Expand directories into supported files; pass through URLs and files.
|
|
|
|
De-duplicates while preserving order: identical files (by resolved real path)
|
|
and repeated URLs collapse to one, so a file passed twice — or matched by both
|
|
an explicit path and a directory scan — is parsed once, not N times.
|
|
"""
|
|
expanded, seen = [], set()
|
|
|
|
def _add(display, key):
|
|
if key not in seen:
|
|
seen.add(key)
|
|
expanded.append(display)
|
|
|
|
for item in raw_inputs:
|
|
if is_url(item):
|
|
_add(item, item)
|
|
continue
|
|
path = Path(item)
|
|
if path.is_dir():
|
|
for child in sorted(path.iterdir()):
|
|
if child.is_file() and is_supported(child.name):
|
|
_add(str(child), str(child.resolve()))
|
|
else:
|
|
key = str(path.resolve()) if path.exists() else str(path)
|
|
_add(item, key)
|
|
return expanded
|
|
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(
|
|
prog="mineru",
|
|
description="Parse PDF / Office / image files into Markdown via MinerU.",
|
|
)
|
|
parser.add_argument("inputs", nargs="*", help="File(s), a directory, or a URL")
|
|
parser.add_argument("--output", "-o", default="./output", help="Output directory (default: ./output)")
|
|
parser.add_argument("--token", help="MinerU API token (or set MINERU_TOKEN)")
|
|
parser.add_argument("--api", choices=["auto", "agent", "standard"], default="auto",
|
|
help="Cloud backend selection (default: auto)")
|
|
parser.add_argument("--engine", choices=["cloud", "local", "auto"], default="cloud",
|
|
help="cloud (MinerU API) | local (offline pymupdf4llm, born-digital PDFs) | "
|
|
"auto (local if the PDF has text, else cloud). Default: cloud")
|
|
parser.add_argument("--model", choices=["pipeline", "vlm", "MinerU-HTML"], default="vlm",
|
|
help="Standard API model (default: vlm)")
|
|
parser.add_argument("--format", dest="formats", action="append", default=[],
|
|
choices=["docx", "html", "latex"], help="Extra export format (repeatable; forces Standard API)")
|
|
parser.add_argument("--lang", default="ch", help="Document language code (default: ch)")
|
|
parser.add_argument("--ocr", action="store_true", help="Enable OCR for scanned documents")
|
|
parser.add_argument("--no-formula", action="store_true", help="Disable formula recognition")
|
|
parser.add_argument("--no-table", action="store_true", help="Disable table recognition")
|
|
parser.add_argument("--pages", help="Page range, e.g. '1-10' or '2,4-6' (Standard only)")
|
|
parser.add_argument("--workers", "-w", type=int, default=DEFAULT_WORKERS,
|
|
help=f"Concurrent submit/upload/download slots (default: {DEFAULT_WORKERS})")
|
|
parser.add_argument("--batch-size", type=int, default=BATCH_MAX_FILES,
|
|
help=f"Max files per Standard batch submit (default/max: {BATCH_MAX_FILES})")
|
|
parser.add_argument("--poll-interval", type=float, default=DEFAULT_POLL_INTERVAL,
|
|
help=f"Seconds between status polls (default: {DEFAULT_POLL_INTERVAL}, adaptive backoff)")
|
|
parser.add_argument("--resume", action="store_true", help="Skip inputs already parsed")
|
|
parser.add_argument("--obsidian", help="Shortcut for --to obsidian with this vault path")
|
|
parser.add_argument("--to", dest="to", action="append", default=[], metavar="SINK",
|
|
help="Deliver parsed Markdown to a content tool (repeatable): "
|
|
"obsidian, logseq, siyuan, notion, linear, yuque, coda, slack, "
|
|
"feishu, confluence, onenote, ticktick, dingtalk, airtable, wecom")
|
|
parser.add_argument("--list-sinks", action="store_true", help="List available delivery targets and exit")
|
|
parser.add_argument("--doctor", action="store_true", help="Run an environment self-check and exit")
|
|
parser.add_argument("--chunk", action="store_true", help="Also emit heading-aware RAG chunks (JSON sidecar + --json)")
|
|
parser.add_argument("--chunk-size", type=int, default=2000, help="Max characters per chunk (default: 2000)")
|
|
parser.add_argument("--split", action="store_true",
|
|
help="Split oversized PDFs past the page caps, parse parts, merge (needs pypdf)")
|
|
parser.add_argument("--split-pages", type=int, help="Pages per split part (default: backend cap, 20 or 200)")
|
|
parser.add_argument("--stdout", action="store_true", help="Print Markdown to stdout (single input)")
|
|
parser.add_argument("--json", dest="as_json", action="store_true", help="Print machine-readable status to stdout")
|
|
parser.add_argument("--timeout", type=int, default=600, help="Per-input timeout in seconds (default: 600)")
|
|
parser.add_argument("--quiet", "-q", action="store_true", help="Suppress progress output")
|
|
parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
|
|
return parser
|
|
|
|
|
|
def options_from_args(args) -> ParseOptions:
|
|
return ParseOptions(
|
|
model=args.model,
|
|
language=args.lang,
|
|
is_ocr=args.ocr,
|
|
enable_formula=not args.no_formula,
|
|
enable_table=not args.no_table,
|
|
page_ranges=args.pages,
|
|
extra_formats=tuple(args.formats),
|
|
)
|
|
|
|
|
|
def _log(message, *, quiet):
|
|
if not quiet:
|
|
print(message, file=sys.stderr, flush=True)
|
|
|
|
|
|
def _load_sinks():
|
|
"""Import the optional ``sinks`` delivery package; return the module or None."""
|
|
try:
|
|
script_dir = str(Path(__file__).resolve().parent)
|
|
if script_dir not in sys.path:
|
|
sys.path.insert(0, script_dir)
|
|
import sinks
|
|
return sinks
|
|
except Exception: # pragma: no cover - sinks are optional
|
|
return None
|
|
|
|
|
|
def _print_sinks() -> int:
|
|
sinks = _load_sinks()
|
|
if sinks is None:
|
|
print("Delivery sinks unavailable (scripts/sinks not importable).", file=sys.stderr)
|
|
return 1
|
|
print("Available delivery targets (use --to NAME, repeatable):\n")
|
|
for name in sinks.sink_names():
|
|
sink = sinks.get_sink(name)
|
|
req = ", ".join(sink.requires) if sink.requires else "(no config needed)"
|
|
print(f" {name:11} — {sink.label}\n{'':14}env: {req}")
|
|
return 0
|
|
|
|
|
|
def _check_network() -> tuple:
|
|
try:
|
|
status, _ = _http("GET", "https://mineru.net/", timeout=8)
|
|
return True, f"reachable (HTTP {status})"
|
|
except Exception as exc: # noqa: BLE001
|
|
return False, f"unreachable ({type(exc).__name__})"
|
|
|
|
|
|
def _check_token(token: str) -> tuple:
|
|
try:
|
|
_api_json("POST", f"{STANDARD_API}/extract/task", token=token, payload={})
|
|
return True, "accepted"
|
|
except MinerUError as exc:
|
|
if exc.code in ("A0202", "A0211"):
|
|
return False, f"invalid/expired ({exc.code}) — refresh at https://mineru.net/apiManage/token"
|
|
return True, "accepted (token authenticates; a parameter error is expected here)"
|
|
except Exception as exc: # noqa: BLE001
|
|
return False, f"check failed ({type(exc).__name__})"
|
|
|
|
|
|
def _module_present(module: str) -> bool:
|
|
import importlib.util
|
|
try:
|
|
return importlib.util.find_spec(module) is not None
|
|
except (ImportError, ValueError):
|
|
return False
|
|
|
|
|
|
def _doctor(as_json: bool = False) -> int:
|
|
"""Environment self-check: Python, API reachability, token, optional extras, sinks."""
|
|
import platform
|
|
|
|
py_ok = sys.version_info >= (3, 8)
|
|
net_ok, net_detail = _check_network()
|
|
token = os.environ.get("MINERU_TOKEN")
|
|
if not token:
|
|
tok_ok, tok_detail = True, "not set (Agent API works token-free)"
|
|
else:
|
|
tok_ok, tok_detail = _check_token(token)
|
|
|
|
extras = {
|
|
"pypdf (--split)": _module_present("pypdf"),
|
|
"pymupdf4llm (--engine local)": _module_present("pymupdf4llm"),
|
|
"html-for-docx (wps sink)": _module_present("html4docx"),
|
|
"roam-client (roam sink)": _module_present("roam_client"),
|
|
}
|
|
|
|
sinks = _load_sinks()
|
|
if sinks is not None:
|
|
names = sinks.sink_names()
|
|
configured = [n for n in names if sinks.get_sink(n).is_configured()]
|
|
sinks_detail = f"{len(names)} registered · {len(configured)} configured"
|
|
else:
|
|
sinks_detail = "unavailable"
|
|
|
|
report = {
|
|
"version": __version__,
|
|
"python": {"ok": py_ok, "detail": platform.python_version()},
|
|
"network": {"ok": net_ok, "detail": net_detail},
|
|
"token": {"ok": tok_ok, "detail": tok_detail},
|
|
"optional_extras": extras,
|
|
"sinks": sinks_detail,
|
|
"healthy": py_ok and net_ok and tok_ok,
|
|
}
|
|
|
|
if as_json:
|
|
print(json.dumps(report, ensure_ascii=False, indent=2))
|
|
else:
|
|
mark = lambda ok: "✅" if ok else "❌" # noqa: E731
|
|
print(f"MinerU Skill doctor (v{__version__})\n")
|
|
print(f" {mark(py_ok)} Python {report['python']['detail']}")
|
|
print(f" {mark(net_ok)} MinerU API {net_detail}")
|
|
print(f" {mark(tok_ok)} MINERU_TOKEN {tok_detail}")
|
|
print(" · Optional extras:")
|
|
for label, present in extras.items():
|
|
print(f" {mark(present)} {label}")
|
|
print(f" · Sinks: {sinks_detail}")
|
|
print(f"\n{'✅ healthy' if report['healthy'] else '❌ issues found'}")
|
|
return 0 if report["healthy"] else 1
|
|
|
|
|
|
def _deliver(results, names, sinks, *, quiet):
|
|
"""Deliver each completed result's Markdown to the requested sinks."""
|
|
for res in results:
|
|
if res.state != STATE_DONE or not res.markdown:
|
|
continue
|
|
doc = sinks.ParsedDoc(
|
|
title=res.name, markdown=res.markdown, source=res.source,
|
|
modality=res.modality, markdown_path=res.markdown_path,
|
|
)
|
|
for outcome in sinks.deliver_all(doc, names):
|
|
res.sinks.append(outcome.to_status())
|
|
if outcome.ok:
|
|
_log(f" 📤 {res.name} → {outcome.sink}: {outcome.url or outcome.detail or 'ok'}",
|
|
quiet=quiet)
|
|
else:
|
|
_log(f" ⚠️ {res.name} → {outcome.sink}: {outcome.error}", quiet=quiet)
|
|
|
|
|
|
def _chunk_results(results, *, max_chars, quiet):
|
|
"""Attach heading-aware RAG chunks to each result and write a JSON sidecar."""
|
|
try:
|
|
script_dir = str(Path(__file__).resolve().parent)
|
|
if script_dir not in sys.path:
|
|
sys.path.insert(0, script_dir)
|
|
import chunking
|
|
except Exception: # pragma: no cover - chunking is stdlib, should always import
|
|
_log("⚠️ --chunk requested but the chunking module is unavailable.", quiet=quiet)
|
|
return
|
|
for res in results:
|
|
if res.state != STATE_DONE or not res.markdown:
|
|
continue
|
|
res.chunks = chunking.chunk_markdown(res.markdown, max_chars=max_chars, source=res.source)
|
|
if res.markdown_path:
|
|
sidecar = Path(res.markdown_path).with_suffix(".chunks.json")
|
|
try:
|
|
sidecar.write_text(json.dumps(res.chunks, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
_log(f" 🧩 {res.name}: {len(res.chunks)} chunk(s) → {sidecar}", quiet=quiet)
|
|
except OSError:
|
|
pass
|
|
|
|
|
|
def main(argv=None) -> int:
|
|
args = build_parser().parse_args(argv)
|
|
|
|
if args.doctor:
|
|
return _doctor(as_json=args.as_json)
|
|
|
|
if args.list_sinks:
|
|
return _print_sinks()
|
|
|
|
if args.obsidian:
|
|
os.environ["OBSIDIAN_VAULT"] = str(Path(args.obsidian).expanduser())
|
|
if "obsidian" not in args.to:
|
|
args.to.append("obsidian")
|
|
|
|
token = args.token or os.environ.get("MINERU_TOKEN")
|
|
opts = options_from_args(args)
|
|
output_dir = Path(args.output)
|
|
|
|
sources = expand_inputs(args.inputs)
|
|
if not sources:
|
|
_log("No supported inputs found.", quiet=args.quiet)
|
|
return 1
|
|
|
|
unsupported = [s for s in sources if not is_url(s) and not is_supported(s)]
|
|
if unsupported:
|
|
_log(f"Unsupported file type(s): {', '.join(unsupported)}", quiet=args.quiet)
|
|
return 1
|
|
|
|
if (args.stdout or args.as_json) and len(sources) > 1:
|
|
# Keep stdout machine-clean: route progress to stderr only (already does).
|
|
pass
|
|
|
|
workers = max(1, min(args.workers, len(sources)))
|
|
_log(
|
|
f"📚 {len(sources)} input(s) · workers={workers} · "
|
|
f"{'token set' if token else 'no token (Agent API)'}",
|
|
quiet=args.quiet,
|
|
)
|
|
|
|
cap = split_cap(token, args.api, args.split_pages)
|
|
|
|
def log_result(res):
|
|
icon = {"done": "✅", "skipped": "⏭️", "failed": "❌"}.get(res.state, "•")
|
|
timing = f" ({res.elapsed}s)" if res.elapsed else ""
|
|
_log(
|
|
f" {icon} [{res.api}/{res.modality}] {res.name}{timing}"
|
|
+ (f" — {res.error}" if res.error else ""),
|
|
quiet=args.quiet,
|
|
)
|
|
|
|
results: list = []
|
|
use_pipeline = args.engine == "cloud" and not args.split and len(sources) > 1
|
|
|
|
if use_pipeline:
|
|
results = run_pipeline(
|
|
sources,
|
|
opts,
|
|
token=token,
|
|
output_dir=output_dir,
|
|
api=args.api,
|
|
resume=args.resume,
|
|
poll_interval=args.poll_interval,
|
|
timeout=args.timeout,
|
|
batch_size=args.batch_size,
|
|
workers=workers,
|
|
want_markdown=bool(args.to) or args.chunk or args.stdout,
|
|
on_result=log_result,
|
|
)
|
|
else:
|
|
out_stem_of = dict(zip(sources, unique_out_stems(sources)))
|
|
|
|
def run(source):
|
|
res = None
|
|
if args.split and args.engine != "local":
|
|
res = process_split(
|
|
source, opts, token=token, output_dir=output_dir, api=args.api,
|
|
resume=args.resume, timeout=args.timeout, cap=cap, engine=args.engine,
|
|
out_stem=out_stem_of[source],
|
|
)
|
|
if res is None:
|
|
res = process_one(
|
|
source, opts, token=token, output_dir=output_dir, api=args.api,
|
|
obsidian=None, resume=args.resume, timeout=args.timeout, engine=args.engine,
|
|
out_stem=out_stem_of[source],
|
|
)
|
|
log_result(res)
|
|
return res
|
|
|
|
if workers == 1:
|
|
results = [run(s) for s in sources]
|
|
else:
|
|
with ThreadPoolExecutor(max_workers=workers) as pool:
|
|
futures = {pool.submit(run, s): s for s in sources}
|
|
for future in as_completed(futures):
|
|
results.append(future.result())
|
|
|
|
done = [r for r in results if r.state == STATE_DONE]
|
|
skipped = [r for r in results if r.state == "skipped"]
|
|
failed = [r for r in results if r.state == STATE_FAILED]
|
|
|
|
if args.to:
|
|
sinks = _load_sinks()
|
|
if sinks is None:
|
|
_log("⚠️ --to requested but the sinks package is unavailable.", quiet=args.quiet)
|
|
else:
|
|
_deliver(results, args.to, sinks, quiet=args.quiet)
|
|
|
|
if args.chunk:
|
|
_chunk_results(results, max_chars=args.chunk_size, quiet=args.quiet)
|
|
|
|
if args.as_json:
|
|
print(json.dumps({
|
|
"total": len(results),
|
|
"done": len(done),
|
|
"skipped": len(skipped),
|
|
"failed": len(failed),
|
|
"results": [r.to_status() for r in results],
|
|
}, ensure_ascii=False, indent=2))
|
|
elif args.stdout:
|
|
for r in results:
|
|
if r.markdown is not None:
|
|
print(r.markdown)
|
|
|
|
_log(
|
|
f"\n{'='*48}\n✅ {len(done)} · ⏭️ {len(skipped)} · ❌ {len(failed)}"
|
|
+ (f"\n📁 {output_dir}" if done else ""),
|
|
quiet=args.quiet,
|
|
)
|
|
if failed:
|
|
_log("Failed: " + ", ".join(f"{r.name} ({r.error})" for r in failed), quiet=args.quiet)
|
|
return 1 if failed else 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|