From 96585886f831185dcf02187e42fd3e1b89b3ea4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Tue, 26 May 2026 20:32:41 +0800 Subject: [PATCH 1/5] skill category --- skills/common/data-dashboard/.claude-plugin/plugin.json | 2 +- skills/common/mcp-ui/.claude-plugin/plugin.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/common/data-dashboard/.claude-plugin/plugin.json b/skills/common/data-dashboard/.claude-plugin/plugin.json index 4deb6cc..8ed7587 100644 --- a/skills/common/data-dashboard/.claude-plugin/plugin.json +++ b/skills/common/data-dashboard/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "data-dashboard", "description": "Renders data as an interactive dashboard card UI using the mcp-ui protocol.", - "category": "Data & Retrieval", + "category": "Interactive UI", "hooks": { "PrePrompt": [ { diff --git a/skills/common/mcp-ui/.claude-plugin/plugin.json b/skills/common/mcp-ui/.claude-plugin/plugin.json index 5825549..a90ec7c 100644 --- a/skills/common/mcp-ui/.claude-plugin/plugin.json +++ b/skills/common/mcp-ui/.claude-plugin/plugin.json @@ -19,5 +19,5 @@ ] } }, - "category": "Data & Retrieval" + "category": "Interactive UI" } From 1925de03555365149874fe9d88a89f8e624e5355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Mon, 1 Jun 2026 11:51:21 +0800 Subject: [PATCH 2/5] add feature memory --- .features/memory/MEMORY.md | 55 +++++++++++++++++++ .features/memory/changelog/2026-Q2.md | 6 ++ .../decisions/2026-06-connection-pool.md | 25 +++++++++ .../decisions/2026-06-custom-embedding.md | 22 ++++++++ .features/memory/docs/.gitkeep | 0 .features/thinking/MEMORY.md | 52 ++++++++++++++++++ .features/thinking/changelog/2026-Q2.md | 7 +++ ...2026-06-middleware-not-native-reasoning.md | 28 ++++++++++ 8 files changed, 195 insertions(+) create mode 100644 .features/memory/MEMORY.md create mode 100644 .features/memory/changelog/2026-Q2.md create mode 100644 
.features/memory/decisions/2026-06-connection-pool.md create mode 100644 .features/memory/decisions/2026-06-custom-embedding.md create mode 100644 .features/memory/docs/.gitkeep create mode 100644 .features/thinking/MEMORY.md create mode 100644 .features/thinking/changelog/2026-Q2.md create mode 100644 .features/thinking/decisions/2026-06-middleware-not-native-reasoning.md diff --git a/.features/memory/MEMORY.md b/.features/memory/MEMORY.md new file mode 100644 index 0000000..b44022f --- /dev/null +++ b/.features/memory/MEMORY.md @@ -0,0 +1,55 @@ +--- +feature: "memory" +scope: "Agent 长期记忆能力(基于 Mem0 + pgvector),跨会话回忆与事实提取存储" +updated_at: "2026-06-01" +status: active +--- + +# Memory(记忆功能) + +## 当前状态 +Agent 的长期记忆能力,底层使用 **Mem0** 库 + **pgvector**(PostgreSQL 向量存储)。 +在 agent 执行前 `recall` 相关记忆并注入 system prompt,在执行后于后台线程异步提取并存储新事实。 +按 `(user_id, agent_id)` 多租户隔离,每个 `agent_id` 一张 `mem0_{agent_id}` 集合表。 + +> 注意:API/配置字段历史上叫 `memori`,为兼容性保留命名,内部实际用的是 **Mem0**。 + +## 配置开关 +| 层级 | 字段 | 默认 | 位置 | +|------|------|------|------| +| 全局总开关 | `MEM0_ENABLED` (env) | `true` | `utils/settings.py:80` | +| Agent 配置 | `enable_memori: bool` | `False` | `agent/agent_config.py:47` | +| API 请求 | `enable_memory: bool` | `False` | `utils/api_models.py:56` | +| 召回数量 | `memori_semantic_search_top_k: int` | `20` | `agent/agent_config.py:48` | +| 召回数量(env) | `MEM0_SEMANTIC_SEARCH_TOP_K` | `20` | `utils/settings.py:84` | +| 连接池大小 | `MEM0_POOL_SIZE` (env) | `50` | `utils/settings.py:61` | + +开启路径:V1 走请求体 `enable_memory`,V2 走 bot 配置 `enable_memory`;两者都受全局 `MEM0_ENABLED` 限制。 +中间件注册在 `agent/deep_assistant.py:270`(`if config.enable_memori:`)。 + +## 核心文件 +- `agent/mem0_manager.py` — Mem0 客户端管理器:实例创建/LRU 缓存(最多 50)、连接池管理、`recall_memories` / `add_memory` / `delete_all`、多租户隔离、`CustomMem0Embedding`、`json_repair` 补丁 +- `agent/mem0_middleware.py` — 中间件:`before_agent` 召回并写入 `config._mem0_context`(行 114/155);`after_agent` 后台异步提取存储 +- `agent/mem0_config.py` — Mem0 配置类:user/agent/session id、记忆提示模板、自定义提取 prompt 
加载(`PreMemoryPrompt` hook) +- `routes/memory.py` — 记忆管理 API(GET/POST/DELETE,供前端管理用户记忆) +- `drop_mem0_tables.py` — 清理脚本,删除所有 `mem0_*` 表(重置/清脏数据) + +## 数据流 +**写入**:User+Assistant 消息 → `after_agent`(后台线程)→ `add_memory` → `Mem0.add()`(LLM 提取事实)→ pgvector 向量化存入 `mem0_{agent_id}`。 +**读取**:User query → `before_agent` → `recall_memories` → `Mem0.search()`(向量相似 top_k)→ 格式化后写入 `config._mem0_context` → 注入 system prompt(也供思考功能 [[../thinking/MEMORY|thinking]] 使用)。 + +## 关键设计决策 +- 复用项目已加载的 embedding 模型(`CustomMem0Embedding`),避免 Mem0 重复加载 SentenceTransformer → `decisions/2026-06-custom-embedding.md` +- 连接池主动释放 + LRU 缓存实例,防连接池耗尽 → `decisions/2026-06-connection-pool.md` + +## Gotchas(开发必读) +- **命名陷阱**:配置叫 `enable_memori`(无 y),API 叫 `enable_memory`,内部实现是 Mem0,三个名字别混。 +- **连接池耗尽**:Mem0 PGVector `__init__` 取连接、`__del__` 释放;必须在每次操作后主动 `_release_connection()`,否则高并发会打满 `MEM0_POOL_SIZE`。 +- **JSON 脆弱**:LLM 提取事实返回的 JSON 常有尾逗号/单引号,已 monkey patch 成 `json_repair.loads`,不要改回原生解析。 +- **表膨胀**:每个 `agent_id` 一张表,多 bot 长期运行会产生大量表,定期用 `drop_mem0_tables.py` 清理。 +- **Embedding 维度**:`paraphrase-multilingual-MiniLM-L12-v2`,384 维;换模型需同步 pgvector 列维度,否则写入报错。 + +## 索引 +- 设计决策:`decisions/` +- 变更历史:`changelog/` +- 相关文档:`docs/` diff --git a/.features/memory/changelog/2026-Q2.md b/.features/memory/changelog/2026-Q2.md new file mode 100644 index 0000000..2c382f4 --- /dev/null +++ b/.features/memory/changelog/2026-Q2.md @@ -0,0 +1,6 @@ +# Changelog 2026 Q2 — Memory + +## 2026-06-01 +- 初始化 feature memory 文档。 +- 记录现状:Mem0 + pgvector 长期记忆,`before_agent` 召回注入 / `after_agent` 后台提取存储。 +- 归档设计决策:自定义 embedding 复用(custom-embedding)、连接池主动释放 + LRU(connection-pool)。 diff --git a/.features/memory/decisions/2026-06-connection-pool.md b/.features/memory/decisions/2026-06-connection-pool.md new file mode 100644 index 0000000..5b58c68 --- /dev/null +++ b/.features/memory/decisions/2026-06-connection-pool.md @@ -0,0 +1,25 @@ +--- +date: "2026-06-01" +status: adopted +topic: "connection-pool" +impact: [memory, performance, 
stability] +--- + +# 连接池主动释放 + Mem0 实例 LRU 缓存 + +## 背景 +Mem0 的 PGVector 后端在实例 `__init__` 时从连接池取一个连接,理论上在 `__del__` 时归还。 +但 Python GC 时机不确定,高并发下连接迟迟不归还会迅速打满 `MEM0_POOL_SIZE`(默认 50),导致后续请求阻塞。 +同时若为每个 `(user_id, agent_id)` 都新建 Mem0 实例且不回收,也会无限占用连接。 + +## 决策 +1. `Mem0Manager` 用 `OrderedDict` 维护最多 50 个 Mem0 实例的 LRU 缓存,超出淘汰最旧的。 +2. 每次记忆操作(recall/add)后调用 `_release_connection()` 立即把连接归还连接池,不等 GC。 + +## 影响 +- 连接池不再被慢 GC 拖垮,高并发稳定。 +- 实例数量有上界,内存可控。 + +## Gotchas +- 不要在操作链路里持有 Mem0 实例的连接跨多个 await,会绕过释放逻辑。 +- LRU 上限(50)与 `MEM0_POOL_SIZE`(50)相关联,调整其一时需一并评估。 diff --git a/.features/memory/decisions/2026-06-custom-embedding.md b/.features/memory/decisions/2026-06-custom-embedding.md new file mode 100644 index 0000000..118a312 --- /dev/null +++ b/.features/memory/decisions/2026-06-custom-embedding.md @@ -0,0 +1,22 @@ +--- +date: "2026-06-01" +status: adopted +topic: "custom-embedding" +impact: [memory, performance] +--- + +# 复用项目 embedding 模型而非 Mem0 自带 SentenceTransformer + +## 背景 +Mem0 默认会自行加载一个 SentenceTransformer 做 embedding。项目本身已经通过 `GlobalModelManager` +加载了 `paraphrase-multilingual-MiniLM-L12-v2`(384 维)。若放任 Mem0 自加载,会出现同一模型在内存中加载两份,浪费显存/内存。 + +## 决策 +在 `agent/mem0_manager.py` 实现 `CustomMem0Embedding`,把 Mem0 的 embedder 接到项目已加载的全局模型上,复用同一份权重。 + +## 影响 +- 内存占用显著下降(不重复加载模型)。 +- embedding 维度固定为 384,与项目主模型一致;换模型时 pgvector 列维度必须同步调整。 + +## 备注 +相关连接池/实例缓存策略见 [[2026-06-connection-pool]]。 diff --git a/.features/memory/docs/.gitkeep b/.features/memory/docs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/.features/thinking/MEMORY.md b/.features/thinking/MEMORY.md new file mode 100644 index 0000000..368cd01 --- /dev/null +++ b/.features/thinking/MEMORY.md @@ -0,0 +1,52 @@ +--- +feature: "thinking" +scope: "Agent 思考功能(基于 GuidelineMiddleware 的前置辅助推理),在主回答前生成一次 <think> 内容" +updated_at: "2026-06-01" +status: active +--- + +# Thinking(思考功能) + +## 当前状态 +思考功能通过自定义的 **`GuidelineMiddleware`** 实现:在主 agent 执行前,先用业务指引 prompt 调一次模型做"思考", +把结果包成 `<think>...</think>` 标签并打上 `message_tag: "THINK"` 
元数据,供前端识别/折叠展示。 + +> 重要:这是"主请求前的一次辅助请求",**不是** Qwen 模型内置的 reasoning/extended-thinking 模式,因此与具体模型无关,任何 LLM 都能用。对标 OpenAI o1 / Claude thinking,但实现更轻。 + +## 配置开关 +| 层级 | 字段 | 默认 | 位置 | +|------|------|------|------| +| Agent 配置 | `enable_thinking: bool` | `False` | `agent/agent_config.py:26` | +| API 请求 | `enable_thinking: bool` | `False` | `utils/api_models.py:54` | + +开启路径:V1 走请求体 `enable_thinking`,V2 走 bot 配置 `enable_thinking`。 +中间件注册在 `agent/deep_assistant.py:294`:`if config.enable_thinking: middleware.append(GuidelineMiddleware(...))`。 + +## 核心文件 +- `agent/guideline_middleware.py` — 思考主逻辑。`get_guideline_prompt`(行 53+)组装指引 prompt;`before_agent`/`abefore_agent` 调模型生成思考,包 `<think>` 标签并标 `THINK`(行 120-124 / 146-149)。 +- `agent/deep_assistant.py:294-295` — 按 `enable_thinking` 注册中间件。 + +## 数据流 +1. `before_agent` 加载指引(system prompt 中的 Guidelines 块)。 +2. 从 system prompt 提取 guidelines / tool_description / scenarios / terms_list。 +3. 组装 `guideline_prompt` = 业务规则 + 聊天历史 + **记忆上下文** + 工具描述 + 场景 + 术语分析。 +4. 调模型一次:`SystemMessage(guideline_prompt)` + 用户最后一条消息 → 得到思考内容。 +5. 内容包成 `<think>...</think>`,`additional_kwargs["message_tag"] = "THINK"`。 +6. 追加一条空 `HumanMessage`(兼容"最后必须是 user 消息"的模型)。 +7. 
主 agent 继续执行,产出正式回答。 + +## 与记忆功能的耦合 +`guideline_middleware.py:63` 读取 `config._mem0_context`(由 [[../memory/MEMORY|memory]] 的 `before_agent` 写入)。 +即:思考阶段会把已召回的长期记忆纳入指引 prompt,从而基于记忆做更好的分析。 +**顺序依赖**:memory 中间件需在 thinking 之前执行,`_mem0_context` 才有值。 + +## Gotchas(开发必读) +- **思考是非流式的**:思考内容在 `before_agent` 一次性完整生成,只有正式回答才流式输出。前端靠 `<think>` 标签 + `message_tag:"THINK"` 折叠展示。 +- **额外一次模型调用**:每次开启都多打一次 LLM 请求,增加延迟和成本,按场景权衡。 +- **不是模型原生 reasoning**:别误以为依赖 `enable_thinking` 透传给 Qwen,它是中间件层的自定义实现。 +- **空 HumanMessage 收尾**:思考消息后会补一条空 user 消息,改消息列表处理逻辑时勿误删。 +- **依赖记忆上下文顺序**:若调整中间件注册顺序,确认 memory 仍在 thinking 之前。 + +## 索引 +- 设计决策:`decisions/` +- 变更历史:`changelog/` diff --git a/.features/thinking/changelog/2026-Q2.md b/.features/thinking/changelog/2026-Q2.md new file mode 100644 index 0000000..b617c21 --- /dev/null +++ b/.features/thinking/changelog/2026-Q2.md @@ -0,0 +1,7 @@ +# Changelog 2026 Q2 — Thinking + +## 2026-06-01 +- 初始化 feature memory 文档。 +- 记录现状:`GuidelineMiddleware` 在 `before_agent` 生成 `<think>` 思考内容,标 `message_tag:"THINK"`。 +- 归档设计决策:用中间件实现而非模型原生 reasoning(middleware-thinking)。 +- 记录与 memory 功能的顺序耦合(依赖 `_mem0_context`)。 diff --git a/.features/thinking/decisions/2026-06-middleware-not-native-reasoning.md b/.features/thinking/decisions/2026-06-middleware-not-native-reasoning.md new file mode 100644 index 0000000..5fd8448 --- /dev/null +++ b/.features/thinking/decisions/2026-06-middleware-not-native-reasoning.md @@ -0,0 +1,28 @@ +--- +date: "2026-06-01" +status: adopted +topic: "middleware-thinking" +impact: [thinking, model-compat] +--- + +# 用中间件实现思考,而非依赖模型原生 reasoning + +## 背景 +"思考功能"可以有两种实现: +A. 透传 `enable_thinking` 给底层模型,依赖模型自带的 reasoning/extended-thinking 能力。 +B. 
在主请求前自己加一次"指引思考"的辅助 LLM 调用。 + +A 路线要求底层模型支持原生 reasoning,且不同模型行为/输出格式不一致,难以统一前端处理。 + +## 决策 +采用 B:实现 `GuidelineMiddleware`,在 `before_agent` 阶段用业务指引 prompt 调一次模型生成思考, +统一包成 `<think>...</think>` + `message_tag:"THINK"`。 + +## 影响 +- 与具体模型解耦,任何 LLM(OpenAI/Claude/Qwen)都能用。 +- 思考阶段可注入业务规则、工具描述、术语分析、记忆上下文,可控性强。 +- 代价:每次多一次 LLM 调用(延迟 + 成本);思考内容非流式。 + +## Gotchas +- 思考依赖 `config._mem0_context`,需保证 memory 中间件先于本中间件执行。 +- 思考后补空 `HumanMessage` 以兼容"末条须为 user"的模型,勿删。 From b618cb12d2d9fc945c95da19317d400c81e0e327 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Fri, 5 Jun 2026 14:35:17 +0800 Subject: [PATCH 3/5] add mineru --- skills/developing/mineru/SKILL.md | 49 + .../mineru/references/api_reference.md | 170 ++ .../mineru/references/comparison.md | 193 ++ .../mineru/references/integrations.md | 59 + skills/developing/mineru/scripts/__init__.py | 1 + skills/developing/mineru/scripts/chunking.py | 88 + .../developing/mineru/scripts/local_engine.py | 59 + skills/developing/mineru/scripts/mineru.py | 1996 +++++++++++++++++ .../developing/mineru/scripts/mineru_mcp.py | 178 ++ .../mineru/scripts/sinks/__init__.py | 75 + .../developing/mineru/scripts/sinks/_http.py | 72 + skills/developing/mineru/scripts/sinks/_md.py | 244 ++ .../mineru/scripts/sinks/airtable.py | 50 + .../developing/mineru/scripts/sinks/base.py | 101 + .../developing/mineru/scripts/sinks/coda.py | 72 + .../mineru/scripts/sinks/confluence.py | 66 + .../mineru/scripts/sinks/dingtalk.py | 65 + .../developing/mineru/scripts/sinks/feishu.py | 124 + .../developing/mineru/scripts/sinks/linear.py | 75 + .../developing/mineru/scripts/sinks/local.py | 105 + .../developing/mineru/scripts/sinks/notion.py | 130 ++ .../developing/mineru/scripts/sinks/onenote.py | 66 + .../developing/mineru/scripts/sinks/roam.py | 106 + .../developing/mineru/scripts/sinks/siyuan.py | 111 + .../developing/mineru/scripts/sinks/slack.py | 95 + .../mineru/scripts/sinks/ticktick.py | 48 + .../developing/mineru/scripts/sinks/wecom.py | 60 + 
skills/developing/mineru/scripts/sinks/wps.py | 104 + .../developing/mineru/scripts/sinks/yuque.py | 65 + skills/developing/mineru/scripts/splitter.py | 64 + 30 files changed, 4691 insertions(+) create mode 100644 skills/developing/mineru/SKILL.md create mode 100644 skills/developing/mineru/references/api_reference.md create mode 100644 skills/developing/mineru/references/comparison.md create mode 100644 skills/developing/mineru/references/integrations.md create mode 100644 skills/developing/mineru/scripts/__init__.py create mode 100644 skills/developing/mineru/scripts/chunking.py create mode 100644 skills/developing/mineru/scripts/local_engine.py create mode 100644 skills/developing/mineru/scripts/mineru.py create mode 100644 skills/developing/mineru/scripts/mineru_mcp.py create mode 100644 skills/developing/mineru/scripts/sinks/__init__.py create mode 100644 skills/developing/mineru/scripts/sinks/_http.py create mode 100644 skills/developing/mineru/scripts/sinks/_md.py create mode 100644 skills/developing/mineru/scripts/sinks/airtable.py create mode 100644 skills/developing/mineru/scripts/sinks/base.py create mode 100644 skills/developing/mineru/scripts/sinks/coda.py create mode 100644 skills/developing/mineru/scripts/sinks/confluence.py create mode 100644 skills/developing/mineru/scripts/sinks/dingtalk.py create mode 100644 skills/developing/mineru/scripts/sinks/feishu.py create mode 100644 skills/developing/mineru/scripts/sinks/linear.py create mode 100644 skills/developing/mineru/scripts/sinks/local.py create mode 100644 skills/developing/mineru/scripts/sinks/notion.py create mode 100644 skills/developing/mineru/scripts/sinks/onenote.py create mode 100644 skills/developing/mineru/scripts/sinks/roam.py create mode 100644 skills/developing/mineru/scripts/sinks/siyuan.py create mode 100644 skills/developing/mineru/scripts/sinks/slack.py create mode 100644 skills/developing/mineru/scripts/sinks/ticktick.py create mode 100644 
skills/developing/mineru/scripts/sinks/wecom.py create mode 100644 skills/developing/mineru/scripts/sinks/wps.py create mode 100644 skills/developing/mineru/scripts/sinks/yuque.py create mode 100644 skills/developing/mineru/scripts/splitter.py diff --git a/skills/developing/mineru/SKILL.md b/skills/developing/mineru/SKILL.md new file mode 100644 index 0000000..88d8ba4 --- /dev/null +++ b/skills/developing/mineru/SKILL.md @@ -0,0 +1,49 @@ +--- +name: mineru +description: An AI-Native skill for parsing PDF / Office / image files into Markdown with MinerU — a fast, zero-config document parser for AI agents. Works with NO token via the Agent API and auto-upgrades to the Standard API (token) for large files, batches, and DOCX/HTML/LaTeX export. Use when converting PDF/Word/PPT/Excel/image documents, extracting text/tables/formulas, running OCR, or batch processing. +category: Document Processing +metadata: + author: Nebutra + version: "3.3.1" + argument-hint: +--- + +# MinerU PDF Parser + +Parse PDF, Office, and image documents into structured Markdown via the MinerU API. + +## Quick Start + +```bash +# Zero-config: no token, no install (free Agent API) +python3 "${CLAUDE_PLUGIN_ROOT}/scripts/mineru.py" ./document.pdf --output ./output/ + +# Pipe Markdown back to an agent +python3 "${CLAUDE_PLUGIN_ROOT}/scripts/mineru.py" ./document.pdf --stdout + +# Power mode: token unlocks large files / batch / extra formats +export MINERU_TOKEN="..." 
# https://mineru.net/apiManage/token +python3 "${CLAUDE_PLUGIN_ROOT}/scripts/mineru.py" ./pdfs/ --output ./output/ --workers 8 --resume +``` + +## Features + +- **Auto-routing**: free Agent API by default, auto-upgrades to the Standard API (token) for large/batch/extra-format jobs +- **Multi-modal**: PDF, images, Word, PPT, Excel, HTML +- **High-performance OCR**: `--ocr` with language selection (`--lang`) +- **Formula & table recognition**: LaTeX formulas, structured tables +- **Multi-format export**: Markdown (default), plus DOCX / HTML / LaTeX +- **AI-Native output**: `--stdout` (Markdown) and `--json` (machine status) +- **Batch + resume**: parallel workers with `--resume` +- **Zero dependencies**: standard library only + +## Authentication + +A token is **optional** — the Agent API works without one. Set a token to unlock +the Standard API (≤ 200 MB / ≤ 200 pages, batch, DOCX/HTML/LaTeX): + +```bash +export MINERU_TOKEN="your-token-here" # https://mineru.net/apiManage/token +``` + +Official API docs: https://mineru.net/apiManage/docs diff --git a/skills/developing/mineru/references/api_reference.md b/skills/developing/mineru/references/api_reference.md new file mode 100644 index 0000000..d0c32f8 --- /dev/null +++ b/skills/developing/mineru/references/api_reference.md @@ -0,0 +1,170 @@ +# MinerU API Reference + +Official docs: https://mineru.net/apiManage/docs · Token: https://mineru.net/apiManage/token + +MinerU exposes **two** document-parsing APIs. This skill auto-routes between them. 
+ +| | 🎯 Standard API | ⚡ Agent API (lightweight) | +|---|---|---| +| Base URL | `https://mineru.net/api/v4` | `https://mineru.net/api/v1/agent` | +| Token | **required** (`Bearer`) | **none** (IP rate-limited) | +| Models | `pipeline` / `vlm` / `MinerU-HTML` | fixed lightweight `pipeline` | +| File size | ≤ 200 MB | ≤ 10 MB | +| Pages | ≤ 200 | ≤ 20 | +| Batch | ≤ 50 per request | single file only | +| Output | zip (Markdown + JSON, optional DOCX/HTML/LaTeX) | Markdown only (CDN link) | +| Designed for | high-accuracy / complex / batch | AI-agent / quick / no-login | + +Free Standard-API quota: **1000 pages/day at highest priority** (overflow is lower priority). + +--- + +## Authentication (Standard API) + +``` +Authorization: Bearer YOUR_API_TOKEN +``` + +Get a token at https://mineru.net/apiManage/token. + +> **Response envelopes.** Business endpoints return `{"code":0,"data":{…},"msg":"ok"}`. +> The auth/gateway layer returns a *different* shape on failure: +> `{"success":false,"msgCode":"A0202","msg":"user authenticate failed"}`. +> Clients must handle both — this skill maps `msgCode` to the same error hints. + +--- + +## Standard API endpoints (`/api/v4`) + +### Single URL — `POST /extract/task` + +```json +{ + "url": "https://example.com/doc.pdf", + "model_version": "vlm", + "is_ocr": false, + "enable_formula": true, + "enable_table": true, + "language": "ch", + "page_ranges": "1-10", + "extra_formats": ["docx", "html"], + "data_id": "my-document" +} +``` +Response → `{ "code": 0, "data": { "task_id": "…" } }`. HTML inputs require `model_version: "MinerU-HTML"`. + +### Get task result — `GET /extract/task/{task_id}` + +```json +{ "code": 0, "data": { "task_id": "…", "state": "done", "full_zip_url": "https://…", "err_msg": "" } } +``` + +### Batch local upload — `POST /file-urls/batch` + +Returns signed upload URLs; PUT each file (no `Content-Type`). Up to **50** files / request. 
+ +```json +{ "files": [ { "name": "doc.pdf", "data_id": "doc" } ], "model_version": "vlm" } +``` +Response → `{ "code": 0, "data": { "batch_id": "…", "file_urls": ["https://…"] } }`. + +### Batch URL — `POST /extract/task/batch` + +```json +{ "files": [ { "url": "https://…/doc.pdf", "data_id": "doc" } ], "model_version": "vlm" } +``` + +### Batch results — `GET /extract-results/batch/{batch_id}` + +```json +{ "code": 0, "data": { "batch_id": "…", "extract_result": [ + { "file_name": "doc.pdf", "state": "done", "full_zip_url": "https://…" } +] } } +``` + +--- + +## Agent API endpoints (`/api/v1/agent`) — no token + +### URL — `POST /parse/url` + +```json +{ "url": "https://…/doc.pdf", "language": "ch", "enable_table": true, "is_ocr": false, "enable_formula": true, "page_range": "1-10" } +``` +`page_range` accepts `from-to` or a single page only (no commas). Returns `{ "code": 0, "data": { "task_id": "…" } }`. + +### File — `POST /parse/file` + +```json +{ "file_name": "doc.pdf", "language": "ch" } +``` +Response → `{ "data": { "task_id": "…", "file_url": "https://oss…" } }`; PUT the file to `file_url`. + +### Result — `GET /parse/{task_id}` + +```json +{ "code": 0, "data": { "task_id": "…", "state": "done", "markdown_url": "https://cdn…/full.md" } } +``` + +--- + +## Task states + +`pending` (queued) · `running` (parsing) · `converting` (format conversion) · +`uploading` (downloading source, Agent) · `waiting-file` (awaiting upload) · +`done` (complete) · `failed` (error). 
+ +--- + +## Parameters + +| Parameter | Type | Default | Notes | +|-----------|------|---------|-------| +| `model_version` | string | `pipeline` | `pipeline`, `vlm` (recommended), `MinerU-HTML` (HTML only) | +| `is_ocr` | bool | `false` | OCR for scanned docs (pipeline/vlm) | +| `enable_formula` | bool | `true` | Formula recognition | +| `enable_table` | bool | `true` | Table recognition | +| `language` | string | `ch` | OCR language (see official `language` table) | +| `page_ranges` | string | all | Standard: `"2,4-6"`; Agent `page_range`: `"1-10"` only | +| `extra_formats` | array | `[]` | `docx` / `html` / `latex` (Standard only) | +| `data_id` | string | – | `[A-Za-z0-9_.-]`, ≤ 128 chars | +| `no_cache` | bool | `false` | Bypass URL cache (Standard) | +| `cache_tolerance` | int | `900` | Cache TTL seconds (Standard) | + +--- + +## Limits + +| | Standard | Agent | +|---|---|---| +| File size | 200 MB | 10 MB | +| Pages | 200 | 20 | +| Batch | 50 / request | 1 | +| Quota | 1000 pages/day priority | IP rate-limited (HTTP 429) | + +Supported types: PDF, images (png/jpg/jpeg/jp2/webp/gif/bmp), Doc(x), Ppt(x), Xls(x); HTML is Standard-only. 
+ +--- + +## Error codes + +| Code | Meaning | +|------|---------| +| `A0202` | Invalid token | +| `A0211` | Token expired | +| `-500` | Parameter error | +| `-10001` / `-10002` | Service error / invalid params | +| `-60002` | Unsupported file format | +| `-60003` / `-60004` | File read failed / empty file | +| `-60005` | File too large (> 200 MB) | +| `-60006` | Too many pages (> 200) | +| `-60008` | File read timeout (URL unreachable) | +| `-60010` | Parse failed | +| `-60015` / `-60016` | File / format conversion failed | +| `-60018` | Daily quota reached | +| `-60022` | Web page read failed (rate-limited) | +| **Agent API** | | +| `-30001` | Exceeds Agent 10 MB limit → use Standard API | +| `-30002` | Unsupported file type for Agent | +| `-30003` | Exceeds Agent 20-page limit → use Standard API or `--pages` | +| `-30004` | Invalid request parameters | diff --git a/skills/developing/mineru/references/comparison.md b/skills/developing/mineru/references/comparison.md new file mode 100644 index 0000000..5e761c9 --- /dev/null +++ b/skills/developing/mineru/references/comparison.md @@ -0,0 +1,193 @@ + + +# MinerU Skill — Competitive Comparison Reference + +This document gives an honest, sourced, per-tool breakdown of how **MinerU Skill** compares to the document-parsing landscape. Read the framing first: it determines how to interpret every "we win / they win" below. + +## What MinerU Skill actually is (and is not) + +MinerU Skill is a **zero-config, zero-dependency, agent-native convenience layer over [MinerU](https://github.com/opendatalab/MinerU)'s cloud API**, plus 17 turnkey delivery integrations to note/knowledge/content tools. Concretely (verified in this repo): + +- Core script `scripts/mineru.py` is **~54KB / ~1,350 lines of pure Python standard library** — no `requests`/`aiohttp`, no model weights. 
+- A **genuinely token-free** default: the free **Agent API** path (`agent_parse` → `_agent_poll`) sends **no `Authorization` header** (the Bearer header is set only when a token is present). Files ≤10MB / ≤20 pages. +- **Auto-routing**: with a token, large/batched/extra-format jobs use the **Standard API** (≤200MB / ≤200 pages); the Agent path **auto-escalates** to Standard on size/page limits. +- **17 delivery sinks** (16 sink modules + `local.py` registering both `obsidian` and `logseq`): obsidian, logseq, siyuan, notion, confluence, onenote, coda, yuque, feishu, slack, dingtalk, wecom, ticktick, linear, airtable — all zero-dependency — plus **roam** (needs `roam-client`) and **wps** (needs `html-for-docx`) which lazy-load one library only when used. +- `--resume` dedup, parallel `--workers` (ThreadPoolExecutor), `--stdout`/`--json` agent output. + +**Critical dependency:** our accuracy is **entirely downstream of, and capped by, what MinerU's cloud serves.** We own no models. Therefore: + +- We have **no quality edge** over any other cloud wrapper that hits the same MinerU API — OCR/table/formula output is **identical**. +- Self-hosting the MinerU engine gives the **same or better** accuracy (version-controllable, no upload caps). + +**Hard limits we cannot exceed:** 10MB/20-page free Agent tier, 200MB/200-page Standard tier, plus IP rate limits. Self-hosted tools have no such caps (only hardware). + +**Our benchmark is latency-only.** `tests/test_live.py` measures end-to-end cloud round-trip latency (~13–14s for the official demo PDF). It is **not** an accuracy benchmark; we have no OmniDocBench/olmOCR-Bench numbers of our own. + +### A note on the speed claim + +Our ~13–14s/doc cloud round-trip is **not** a clean win over self-hosted GPU engines. A normal self-host with a GPU runs at ~0.18s/page (Marker) or ~2.12 pages/sec (MinerU on A100) — far faster at any real scale. 
We only out-run **slow Apple-Silicon-CPU local runs of small docs** (e.g., M4 VLM at 32–148s/page). Do not frame "faster wall-clock" as a general win. + +### A note on benchmarks + +No single benchmark is authoritative. Different benchmarks favor different tools: +- **OmniDocBench** (v1.5/v1.6): MinerU2.5 **90.67** (v1.5), MinerU2.5-Pro **95.69** (v1.6) — leads, beating Gemini 2.5 Pro / GPT-4o / Qwen2.5-VL-72B on text/table/formula. Source: arXiv 2509.22186. +- **olmOCR-Bench** (Ai2, Oct 2025): olmOCR-2 **82.4** > Marker **76.1** > **MinerU 75.8**. Here MinerU **trails** — this is a real olmOCR win and must stay visible. +- **RD-TableBench**: Reducto 90.2% on complex tables — but Reducto authored this benchmark (vendor-biased). +- Mathpix is the de-facto formula-OCR standard (BLEU/edit-distance studies), though a PaddleOCR-VL-based tool claims to beat it on OmniDocBench v1.0 formula recognition, so the very top is contested. + +> Star counts / versions below (e.g. MinerU "65.7k / v3.2.1") are point-in-time and not independently re-verified. + +--- + +## Category 1 — Self-hosted / open-source parsing engines + +These are the tools that close our single biggest gap: **fully offline / air-gapped / no cloud / no upload caps.** + +### MinerU engine (opendatalab) — the engine we wrap +- **Source:** https://github.com/opendatalab/MinerU · arXiv 2509.22186 · https://huggingface.co/opendatalab/MinerU2.5-Pro-2604-1.2B +- **Strengths:** Owns the SOTA models (OmniDocBench 90.67 / 95.69-Pro v1.6). 109-language OCR, handwriting, cross-page table merge, formula→LaTeX (the source of *our* LaTeX). Fully self-hostable → offline, air-gappable, zero per-page cost, no caps. Pipeline backend runs pure CPU; VLM needs 8GB+ VRAM. Native MCP, Python/Go/TS SDKs, LangChain/LlamaIndex/Dify/FastGPT. +- **Weaknesses vs us:** Heavy install (multi-GB torch/vLLM + weights, 16GB RAM / 20GB disk floor); slow on Apple Silicon; no note/PKM delivery sinks; library/CLI rather than zero-config. 
+- **Verdict:** **Beats us** on offline, privacy, caps, accuracy ceiling, ecosystem. **We beat it** only on zero-install/zero-config and built-in delivery. + +### Marker (datalab-to / VikParuchuri) +- **Source:** https://github.com/datalab-to/marker · https://allenai.org/blog/olmocr-2 +- **Strengths:** Fully offline; very high batch throughput (~122 pages/sec/H100, 0.18s/page GPU); broad formats incl. EPUB; optional local-LLM (Ollama) quality boost with no data leaving the machine; ~35k+ stars, active. +- **Weaknesses:** **GPL-3.0** code + model weights under a modified RAIL-M (free only under ~$2M funding+revenue; commercial above that needs a Datalab license). olmOCR-Bench **76.1** — below olmOCR-2 and MinerU's OmniDocBench standing. +- **Verdict:** Beats us on offline/throughput; we beat it on zero-install and 17 delivery sinks. License gate is a real friction it has and we don't. + +### Docling (IBM / DS4SD) +- **Source:** https://github.com/docling-project/docling · https://huggingface.co/ibm-granite/granite-docling-258M · arXiv 2408.09869 +- **Strengths:** **Widest input modality set** (PDF/DOCX/PPTX/XLSX/HTML/AsciiDoc/LaTeX/CSV/images + **audio via ASR** + USPTO/JATS/XBRL). Tiny 258M Granite-Docling VLM runs on CPU/modest GPU. **MIT code + Apache-2.0 weights.** Deep framework ecosystem (LangChain/LlamaIndex/Haystack + official MCP), IBM-backed, 60k+ stars. Air-gapped by design. +- **Weaknesses:** Absolute accuracy lags MinerU on OmniDocBench/olmOCR-Bench; library-first (not a zero-config CLI); targets framework ingestion, not file delivery to note tools. +- **Verdict:** Beats us on offline, modality breadth, permissive license, ecosystem; we beat it on zero-install and note/PKM delivery. **Do not over-rank its MIT as uniquely best** — olmOCR's Apache-2.0 on *both* code and 7B weights is at least as commercially valuable. 
+ +### olmOCR (allenai) +- **Source:** https://github.com/allenai/olmocr · https://allenai.org/blog/olmocr-2 · https://huggingface.co/datasets/allenai/olmOCR-bench +- **Strengths:** **Leads Ai2's olmOCR-Bench (82.4 vs MinerU 75.8)** — a benchmark where MinerU trails. **Apache-2.0 on code AND the olmOCR-2-7B weights** (most commercial-friendly model reuse here). Built for million-page LLM-training linearization. Offline. +- **Weaknesses:** **PDF/image only** (no Office/HTML); **English-primary**, filters non-English (MinerU does 109-lang); **requires a 12GB+ NVIDIA GPU, no CPU mode at all**. +- **Verdict:** Beats us on offline, that-benchmark accuracy, license, scale. We beat it on modality breadth, multilingual, no-GPU, delivery, zero-install. **Keep the olmOCR-Bench lead visible — do not cherry-pick only OmniDocBench.** + +### Nougat (facebookresearch / Meta AI) +- **Source:** https://github.com/facebookresearch/nougat · arXiv 2308.13418 +- **Strengths:** Strong LaTeX/math on arXiv-style scientific PDFs (its trained niche). Offline. +- **Weaknesses:** **PDF + English/Latin-script only** (no CJK); **CC-BY-NC weights (non-commercial)**; effectively **unmaintained** (last release Aug 2023); known repetition/hallucination/[MISSING_PAGE] failures off-distribution. +- **Verdict:** Offline + niche math is its only edge; we beat it on general-purpose, multilingual, maintenance, commercial license, delivery. + +### PyMuPDF4LLM (pymupdf / Artifex) +- **Source:** https://github.com/pymupdf/pymupdf4llm · https://pymupdf.io/blog/pymupdf-layout-10-faster-pdf-parsing-without-gpus +- **Strengths:** **Far faster and lighter than any ML tool on born-digital PDFs** (~hundreds of pages/sec on plain CPU; a C-optimized variant claims ~520 pages/sec). Lowest dependency/hardware footprint. Offline, no cloud, no caps. Ideal for huge clean-PDF corpora where speed > fidelity. 
+- **Weaknesses:** No ML → no real formula/LaTeX, weak complex tables, poor scanned/handwritten; slow external OCR; **AGPL-3.0 OR Artifex commercial**; Office formats need paid **PyMuPDF Pro**. +- **Verdict:** A genuine win for the speed-over-fidelity, clean-PDF use case. We beat it on hard-doc quality (MinerU's VLM), multilingual OCR, and delivery — but acknowledge its speed/footprint advantage honestly. + +### Zerox (getomni-ai) +- **Source:** https://github.com/getomni-ai/zerox +- **Strengths:** Trivial provider-flexibility (OpenAI/Azure/Bedrock-Claude/Gemini/Vertex); JSON-Schema structured extraction (Node SDK); MIT code. +- **Weaknesses:** **NOT offline and NOT token-free** — mandates a paid cloud vision-LLM key; needs graphicsmagick+ghostscript; **no published benchmarks**; per-page LLM cost can exceed MinerU on large jobs. +- **Verdict:** We beat it on token-free start, benchmarked accuracy, dedicated formula/table models, system-dep footprint, and delivery. It beats us on provider-swap flexibility and typed JSON extraction. + +--- + +## Category 2 — Commercial cloud document-parsing APIs + +Mostly **stronger than us** on enterprise accuracy, SLAs, structured extraction, and RAG/MCP ecosystems. Our honest edges are narrow: token-free + zero-install hosted default, clean Markdown/LaTeX of academic PDFs, and 17 delivery sinks none of them offer. + +### LlamaParse (LlamaIndex / LlamaCloud) +- **Source:** https://www.llamaindex.ai/pricing · LlamaCloud MCP docs +- **Beats us:** Official hosted **MCP server**; deep native RAG stack (parse→index→LlamaExtract/LlamaAgents); steerable NL parsing with frontier LLMs (GPT-4.1/Gemini 2.5 Pro); richer outputs (per-page JSON, XLSX, HTML tables, annotated PDF); enterprise SLAs; mature Python+TS SDKs. +- **We beat:** Token-free start (it needs a LlamaCloud key from page one); zero runtime deps; 17 note/PKM sinks (it delivers to RAG indexes, not note tools); built-in `--resume`/parallel batch CLI. 
+ +### Mathpix (Convert API) +- **Source:** https://mathpix.com/pricing/api · https://mathpix.com/image-to-latex +- **Beats us:** **Best-in-class formula/equation OCR (printed AND handwritten) → clean LaTeX — clearly better than MinerU for pure math fidelity; concede this, do not imply parity.** Mature Snip ecosystem + Overleaf workflows; very low per-image cost at scale. +- **We beat:** Token-free start (Mathpix API requires a paid PAYG account, **$19.99 setup fee**, card on file; **no recurring free monthly allowance** — only a one-time $29 test credit; the consumer Snip app's free quota does **not** apply to the API); general-purpose multi-modal Office parsing; 17 delivery sinks; built-in batch CLI. + +### Unstructured.io +- **Source:** https://unstructured.io/pricing · https://github.com/Unstructured-IO/unstructured +- **Beats us:** **Apache-2.0 core library is fully self-hostable → 100% offline** (we cannot); official MCP + huge connector ecosystem (S3/SharePoint/vector DBs); built-in chunking+embedding (RAG-ready); 25+ file types; permissive license for product embedding. +- **We beat:** Token-free hosted default with zero install (its hosted API needs a key; self-host means running infra); cleaner human-readable Markdown out of the box (its primary output is JSON "elements"); 17 note/PKM sinks (it targets vector DBs/storage). *On parsing quality:* VLM parsing is generally stronger for complex layout/formula, but this is **not a benchmarked head-to-head** — state it as a tendency, not a measured win. + +### Reducto +- **Source:** https://reducto.ai/pricing +- **Beats us:** **Best complex/financial table extraction (90.2% RD-TableBench — vendor-authored but the strongest public evidence)**; agentic multi-pass OCR; SOC2/HIPAA, on-prem/VPC/air-gapped, enterprise SLAs; schema-based extraction with bounding boxes/citations. +- **We beat:** Token-free start (it needs a key + credits); zero-install plain CLI; 17 delivery sinks; auto-routing/--resume/parallel batch. 
+ +### Chunkr (and similar RAG-native APIs) +- **Beats us:** Self-hostable (offline option we lack); RAG-native chunking + broad export (DOCX/HTML/LaTeX). +- **We beat:** Token-free start; zero-install; 17 note/PKM sinks. +- **Caveat (fact-check):** Do **not** claim "stronger VLM Markdown for formulas" — Chunkr cloud uses its own proprietary models and we have **no head-to-head benchmark**. Drop the quality claim; keep only the export-breadth and offline framing. + +--- + +## Category 3 — Other MinerU wrappers, skills & MCP servers (our direct peers) + +**Every cloud-backed wrapper here hits the same MinerU API we do, so its OCR/table/formula output is IDENTICAL to ours.** We have **no quality edge** over them — only DX differences. Claims of "better OCR/formula/Markdown" vs these are **invalid** and must not appear. + +### Official MinerU MCP server (mineru-open-mcp / MinerU-Ecosystem) +- **Source:** https://github.com/opendatalab/MinerU-Ecosystem · https://pypi.org/project/mineru-open-mcp/ +- **Beats us:** **Official, first-party** — tracks API/format changes day-one; native **MCP server** (stdio + streamable-http) in Claude Desktop/Cursor/Windsurf with zero glue; full ecosystem (Python/Go/TS SDKs, LangChain/LlamaIndex/Dify/FastGPT). **Same free no-token Flash tier as us** — our "free zero-token" edge is fully matched by the first party. +- **We beat:** Zero runtime deps (vs pip/uvx install); auto-routing Agent⇄Standard with auto-escalation; 17 delivery sinks; `--resume`/parallel batch; usable as a plain CLI outside any MCP host. + +### MinerU-Document-Explorer (official, opendatalab) +- **Source:** https://github.com/opendatalab/MinerU-Document-Explorer +- **Beats us:** Different, **larger** value prop — a local agent-native **knowledge engine** (BM25/vector/hybrid retrieval + deep-reading + LLM-wiki) with 15 MCP tools; runs 100% locally for its core; MIT, 568 stars. 
+- **We beat:** We're a focused zero-dep converter; broader conversion modalities; 17 delivery sinks (it keeps content in its own index/wiki); no Node/local-model download. + +### linxule/mineru-mcp (Node, cloud) +- **Source:** https://github.com/linxule/mineru-mcp +- **Beats us:** Native MCP server with 6 granular tools (explicit status-polling + batch-status pagination); first-class for Node/JS MCP stacks; batch up to 200 URLs/request. +- **We beat:** **Free no-token path** (it **requires** a token always); zero runtime deps (vs Node 18+); broader modalities (Excel/HTML); 17 delivery sinks; usable as plain CLI outside MCP. + +### mineru-converter-mcp-server (AvatarGanymede/MinerU-MCP) +- **Source:** https://pypi.org/project/mineru-converter-mcp-server/ +- **Beats us:** **Auto-splits PDFs >200MB and segments >600-page docs by page range — gracefully exceeding the 200MB/200-page cap we are bound by.** Turnkey Smithery + Render deploy (per-user key); explicit HTML input. +- **We beat:** Free no-token default (it requires a key); zero runtime deps; plain CLI (no MCP host/Render/Smithery needed); 17 sinks; auto-routing. + +### grimoire-skill (LeoLin990405) +- **Source:** https://github.com/LeoLin990405/grimoire-skill +- **Beats us:** Higher-level knowledge-capture ("parse once, share twice" → Obsidian notes + reusable skill packs); ingests **video** (YouTube/Bilibili) + subtitles (modalities we don't touch); cross-agent skill management; content-aware Obsidian auto-filing. +- **We beat:** Free no-token default (it needs a token + `--cloud-ok` for local files); zero runtime deps (vs bash+jq+awk + optional yt-dlp/ffmpeg); 17 sinks vs primarily Obsidian; broader Office/HTML; cross-platform single-file portability. + +### kesslerio/mineru-pdf-parser (openclaw/ClawHub skill, local CPU) +- **Source:** openclaw/skills · SKILL.md +- **Beats us:** **Fully local/offline (pure CPU, cross-platform)** — no cloud/token/caps; handles privacy-sensitive docs; native Markdown + JSON. 
+- **We beat:** Zero install (it needs a full local MinerU install + weights + shell wrapper); no GPU/heavy runtime; faster wall-clock **only vs slow local CPU**; broader modalities; 17 sinks; `--stdout`/`--json`; better docs. + +### nilecui/mineru-parser-skills (Claude Agent SDK, cloud) +- **Source:** https://github.com/nilecui/mineru-parser-skills +- **Beats us:** Built directly on the Claude Agent SDK (slots into Agent-SDK apps). Honestly little else — it's a thinner cloud wrapper. +- **We beat:** Accepts local files/dirs **and** URLs (it is **URL-only** — cannot parse a local PDF); free no-token default; zero runtime deps; batch/`--resume`/parallel; 17 sinks; broader modalities; mature/documented vs a 4-commit, no-license repo. *Caveat:* our "benchmarked" claim means **latency-measured**, not accuracy-benchmarked. + +### TINKPA/mcp-mineru (local MLX, Apple Silicon) +- **Source:** https://github.com/TINKPA/mcp-mineru +- **Beats us:** **Fully offline/local** via MinerU running on-device (MLX accel); no cloud/token/caps; data never leaves the Mac. +- **We beat:** Zero install/no weights/no GPU; **faster wall-clock only for typical multi-page docs vs its slow local inference (32–148s/page on M4)** — not a general speed win; broader modalities; batch/`--resume`/17 sinks; more active/documented; usable as plain CLI. + +--- + +## Summary of mandatory concessions (do not bury these) + +1. **Offline / air-gapped is our single biggest gap.** MinerU engine, Marker, Docling, olmOCR, Nougat, PyMuPDF4LLM, TINKPA, kesslerio, MinerU-Document-Explorer, and self-hostable Unstructured/Chunkr all run with **zero cloud dependency**. We are cloud-only and **cannot handle confidential/regulated/air-gapped content at all.** +2. **Data privacy:** every self-hosted competitor keeps documents on the machine; we **upload every file** to MinerU's cloud — a hard disqualifier for many regulated users. +3. 
**Accuracy is downstream of, and capped by, MinerU's cloud.** Self-hosting MinerU2.5-Pro gives the same-or-better accuracy with no caps. Same-backend wrappers yield **identical** quality to us. +4. **Hard caps:** 10MB/20-page (Agent), 200MB/200-page (Standard), IP rate limits. mineru-converter-mcp-server works around them via auto-split/segmentation. +5. **Mathpix beats us on formula/LaTeX OCR (incl. handwriting).** +6. **Reducto leads complex/financial tables; olmOCR leads olmOCR-Bench (82.4 vs MinerU 75.8).** Different benchmarks favor different tools — never cherry-pick only OmniDocBench. +7. **Official first-party advantage:** the official MinerU MCP/Document-Explorer + ecosystem track changes day-one and match our free tier; we are third-party, can lag, and ship **no MCP server**. +8. **Permissive-license wins we lack:** olmOCR (Apache-2.0 code + 7B weights), Docling (MIT + Apache-2.0 weights), Unstructured (Apache-2.0 core). +9. **PyMuPDF4LLM is far faster/lighter on born-digital PDFs** (clean-text corpora, speed > fidelity). 
+ +## Sources + +- MinerU engine: https://github.com/opendatalab/MinerU · arXiv 2509.22186 · https://huggingface.co/opendatalab/MinerU2.5-Pro-2604-1.2B · https://neurohive.io/en/state-of-the-art/mineru2-5-open-source-1-2b-model-for-pdf-parsing-outperforms-gemini-2-5-pro-on-benchmarks/ +- Official MCP / ecosystem: https://github.com/opendatalab/MinerU-Ecosystem · https://pypi.org/project/mineru-open-mcp/ · https://github.com/opendatalab/MinerU-Document-Explorer +- Marker: https://github.com/datalab-to/marker · https://allenai.org/blog/olmocr-2 +- Docling: https://github.com/docling-project/docling · arXiv 2408.09869 · https://huggingface.co/ibm-granite/granite-docling-258M +- olmOCR: https://github.com/allenai/olmocr · https://allenai.org/blog/olmocr-2 · https://huggingface.co/datasets/allenai/olmOCR-bench +- Nougat: https://github.com/facebookresearch/nougat · arXiv 2308.13418 +- PyMuPDF4LLM: https://github.com/pymupdf/pymupdf4llm · https://pymupdf.io/blog/pymupdf-layout-10-faster-pdf-parsing-without-gpus +- Zerox: https://github.com/getomni-ai/zerox +- LlamaParse: https://www.llamaindex.ai/pricing +- Mathpix: https://mathpix.com/pricing/api · https://mathpix.com/image-to-latex +- Unstructured: https://unstructured.io/pricing · https://github.com/Unstructured-IO/unstructured +- Reducto: https://reducto.ai/pricing +- Other wrappers: https://github.com/linxule/mineru-mcp · https://pypi.org/project/mineru-converter-mcp-server/ · https://github.com/LeoLin990405/grimoire-skill · https://github.com/nilecui/mineru-parser-skills · https://github.com/TINKPA/mcp-mineru diff --git a/skills/developing/mineru/references/integrations.md b/skills/developing/mineru/references/integrations.md new file mode 100644 index 0000000..c23ffe5 --- /dev/null +++ b/skills/developing/mineru/references/integrations.md @@ -0,0 +1,59 @@ +# Delivery Integrations (`--to`) + +After parsing, MinerU Skill can deliver the Markdown straight into your content +tools using each tool's **official 
ingestion path** — no fragile generic block +converters. Targets are pluggable sinks; select one or more with `--to NAME` +(repeatable). List them live with `python3 scripts/mineru.py --list-sinks`. + +```bash +# Parse and fan out to several destinations at once +python3 scripts/mineru.py paper.pdf --to obsidian --to notion --to slack +``` + +Each sink reads its configuration from **environment variables** so an AI agent +can run it non-interactively. Delivery results appear in `--json` output under +each result's `sinks` array. + +## Support matrix + +| Target | `--to` | Native path | Auth / config (env) | Markdown fidelity | Images | +|--------|--------|-------------|---------------------|-------------------|--------| +| **Obsidian** | `obsidian` (`ob`) | filesystem write + YAML frontmatter | `OBSIDIAN_VAULT`, `OBSIDIAN_SUBDIR?` | full | ✅ copied to `.assets/` | +| **Logseq** | `logseq` | filesystem write, outline + `key:: value` | `LOGSEQ_GRAPH` | full (outline transform) | ✅ copied to `assets/` | +| **SiYuan** | `siyuan` | kernel `createDocWithMd` | `SIYUAN_TOKEN`, `SIYUAN_API_URL?`, `SIYUAN_NOTEBOOK?` | full (GFM) | ✅ `asset/upload` | +| **Notion** | `notion` | `POST /v1/pages` (blocks) | `NOTION_API_KEY`, `NOTION_PARENT_PAGE_ID`, `NOTION_VERSION?` | structure (headings/lists/code/quote) | ⚠️ text only¹ | +| **Linear** | `linear` | GraphQL `issueCreate` | `LINEAR_API_KEY`, `LINEAR_TEAM_ID` | full (Markdown-native) | ✅ base64-inlined | +| **Yuque 语雀** | `yuque` (`语雀`) | open API create doc | `YUQUE_TOKEN`, `YUQUE_NAMESPACE` | full (Markdown-native) | ⚠️ host publicly² | +| **Coda** | `coda` | page canvas `format:markdown` | `CODA_API_TOKEN`, `CODA_DOC_ID?` | full (Markdown-native) | ⚠️ public URL² | +| **Slack** | `slack` | external-upload `.md` file | `SLACK_BOT_TOKEN`, `SLACK_CHANNEL` | full (raw file) | ⚠️ not embedded | +| **Lark 飞书** | `feishu` (`lark`, `飞书`) | Drive `import_tasks` → Docx | `FEISHU_APP_ID`, `FEISHU_APP_SECRET`, `FEISHU_FOLDER_TOKEN?` | 
full (server-converted) | ⚠️ public URL² | +| **Confluence** | `confluence` | `POST /wiki/api/v2/pages` (storage) | `CONFLUENCE_BASE_URL`, `CONFLUENCE_EMAIL`, `CONFLUENCE_API_TOKEN`, `CONFLUENCE_SPACE_ID` | MD→HTML | ⚠️ not attached | +| **OneNote** | `onenote` | Graph `sections/{id}/pages` | `ONENOTE_TOKEN`³, `ONENOTE_SECTION_ID` | MD→HTML | ⚠️ remote only | +| **TickTick 滴答** | `ticktick` (`dida`, `滴答清单`) | `POST /open/v1/task` | `TICKTICK_TOKEN`, `TICKTICK_PROJECT_ID?` | task note | ❌ unsupported | +| **DingTalk 钉钉** | `dingtalk` (`钉钉`) | robot markdown webhook | `DINGTALK_WEBHOOK`, `DINGTALK_SECRET?` | markdown message | ⚠️ public URL only | +| **Airtable** | `airtable` | `POST /v0/{base}/{table}` record | `AIRTABLE_API_KEY`, `AIRTABLE_BASE_ID`, `AIRTABLE_TABLE`, `AIRTABLE_TITLE_FIELD?`, `AIRTABLE_BODY_FIELD?` | record field⁴ | ❌ not uploaded | +| **WeCom 企业微信** | `wecom` (`企业微信`) | app `message/send` markdown | `WECOM_CORPID`, `WECOM_CORPSECRET`, `WECOM_AGENTID`, `WECOM_TOUSER?` | message (subset, ≤2 KB)⁵ | ❌ unsupported | +| **Roam Research** ⁶ | `roam` | `batch-actions` block tree | `ROAM_API_TOKEN`, `ROAM_GRAPH_NAME` | full (Markdown→outline) | ⚠️ public URL | +| **WPS 金山文档** ⁶ | `wps` (`kdocs`, `金山`) | Markdown→DOCX → kdocs upload | `WPS_APP_ID`, `WPS_APP_SECRET`, `WPS_PARENT_PATH?` | DOCX (via html-for-docx) | embedded in DOCX | + +Notes: +1. **Notion** images need a separate `file_uploads` upload-then-reference dance; v1 delivers text + structure and notes the count of un-embedded local images. (Roadmap: image upload.) +2. Hosted services that ingest Markdown by value but have no first-class CLI asset upload — local images must be hosted at a public URL to render. The Markdown is delivered intact; image links that are already URLs work. +3. **OneNote** `ONENOTE_TOKEN` is a Microsoft Graph access token (delegated, scope `Notes.Create`). Obtain it via the device-code OAuth flow; the sink itself stays non-interactive. +4. 
**Airtable** is a database, not a document store — the doc is stored as one record (title + body fields). A good "save this doc as a row" target, not a document publisher. +5. **WeCom** markdown messages are a limited subset (≤2048 bytes, no images/tables, not rendered in the workbench). Best as a notification/summary; for a full document deliver via Lark/Notion and send the link. +6. **Optional-dependency sinks** — these two rely on a third-party library that the sink lazy-imports only when used, so the core and the other 15 sinks stay zero-dependency. If the library is absent, the sink returns a clear `pip install …` hint. They are implemented to the official specs but, being credential/desktop-gated, are best-effort until validated against live accounts. + +## Optional-dependency sinks (`[roam]`, `[wps]`) + +```bash +pip install "mineru-skill[wps]" # html-for-docx (Markdown → DOCX) +pip install "mineru-skill[roam]" # official roam-client SDK (git, needs Python ≥3.11) +# roam-client is git-only; equivalently: +pip install "roam-client @ git+https://github.com/Roam-Research/backend-sdks.git#subdirectory=python" +``` + +- **Roam** — no library ingests Markdown into Roam, but the official `roam-client` SDK handles the genuinely error-prone transport (307/308 peer-host redirect, dual `Authorization`/`x-authorization` Bearer headers, `/write`). We depend on it for transport and build only the Markdown→outline tree, delivering the whole document in one `batch-actions` request. Images must be public URLs. +- **WPS / 金山文档** — Markdown→DOCX uses the maintained pure-pip `html-for-docx` (reusing this project's Markdown→HTML); the kdocs upload signs requests with the documented WPS-2 scheme (plain SHA-1) using only the standard library. Requires an approved kdocs developer app + provisioned appspace. + +Adding more targets is a single small module — see `scripts/sinks/base.py`. PRs welcome. 
diff --git a/skills/developing/mineru/scripts/__init__.py b/skills/developing/mineru/scripts/__init__.py new file mode 100644 index 0000000..94cd8a3 --- /dev/null +++ b/skills/developing/mineru/scripts/__init__.py @@ -0,0 +1 @@ +"""Importable package for MinerU Skill console entry points.""" diff --git a/skills/developing/mineru/scripts/chunking.py b/skills/developing/mineru/scripts/chunking.py new file mode 100644 index 0000000..6e71984 --- /dev/null +++ b/skills/developing/mineru/scripts/chunking.py @@ -0,0 +1,88 @@ +"""Heading-aware Markdown chunking for RAG pipelines (zero-dependency). + +``chunk_markdown`` splits a parsed Markdown document into retrieval-sized chunks +that preserve heading context — matching the RAG-friendliness of LlamaParse / +Unstructured without any dependency. +""" + +from __future__ import annotations + +import re + +_HEADING = re.compile(r"^(#{1,6})\s+(.*)$") + + +def _slug(text: str) -> str: + text = (text or "doc").strip().lower() + text = re.sub(r"[^a-z0-9]+", "-", text).strip("-") + return text or "doc" + + +def _split_by_size(text: str, max_chars: int) -> list: + """Split text into <= max_chars pieces on paragraph boundaries (hard-split if needed).""" + if len(text) <= max_chars: + return [text] + pieces: list = [] + current = "" + for para in text.split("\n\n"): + if len(para) > max_chars: + if current: + pieces.append(current) + current = "" + for i in range(0, len(para), max_chars): + pieces.append(para[i:i + max_chars]) + elif not current: + current = para + elif len(current) + len(para) + 2 <= max_chars: + current = f"{current}\n\n{para}" + else: + pieces.append(current) + current = para + if current: + pieces.append(current) + return pieces + + +def chunk_markdown(markdown: str, *, max_chars: int = 2000, source: str = "") -> list: + """Chunk Markdown by heading, size-splitting long sections. + + Returns ``[{id, index, heading, text, chars, source}, ...]`` where ``heading`` + is the ``H1 > H2 > H3`` breadcrumb for the chunk. 
+ """ + lines = markdown.replace("\r\n", "\n").split("\n") + chunks: list = [] + stack: list = [] # (level, text) heading breadcrumb + buf: list = [] + base = _slug(source) + + def breadcrumb() -> str: + return " > ".join(t for _, t in stack) + + def flush(): + text = "\n".join(buf).strip() + buf.clear() + if not text: + return + head = breadcrumb() + for piece in _split_by_size(text, max_chars): + idx = len(chunks) + chunks.append({ + "id": f"{base}-{idx}", + "index": idx, + "heading": head, + "text": piece, + "chars": len(piece), + "source": source, + }) + + for line in lines: + match = _HEADING.match(line.strip()) + if match: + flush() # close the previous section under its own breadcrumb + level = len(match.group(1)) + while stack and stack[-1][0] >= level: + stack.pop() + stack.append((level, match.group(2))) + buf.append(line) + flush() + return chunks diff --git a/skills/developing/mineru/scripts/local_engine.py b/skills/developing/mineru/scripts/local_engine.py new file mode 100644 index 0000000..f23afc2 --- /dev/null +++ b/skills/developing/mineru/scripts/local_engine.py @@ -0,0 +1,59 @@ +"""Optional fully-offline parsing backend for born-digital PDFs. + +Our single biggest honest gap is being cloud-only. ``--engine local`` parses a +PDF **entirely offline** with the optional, lightweight ``pymupdf4llm`` library +(no GPU, no cloud, no upload caps) — ideal for confidential or born-digital PDFs +where MinerU's cloud VLM is overkill. Scanned/complex docs still want the cloud +engine, so ``--engine auto`` only uses local when the PDF has real text. + + pip install "mineru-skill[local]" # i.e. pip install pymupdf4llm +""" + +from __future__ import annotations + +from pathlib import Path + +_HINT = ( + "--engine local needs pymupdf4llm — pip install 'mineru-skill[local]' " + "(i.e. 
pip install pymupdf4llm)" +) + + +class LocalEngineError(Exception): + """Raised when local parsing is requested but cannot be performed.""" + + +def available() -> bool: + try: + import pymupdf4llm # noqa: F401 + return True + except ImportError: + return False + + +def is_born_digital(path, min_chars: int = 200) -> bool: + """True if the PDF has extractable text (so local parsing is appropriate).""" + try: + import pymupdf + except ImportError: + return False + doc = pymupdf.open(str(path)) + total = 0 + for page in doc: + total += len(page.get_text().strip()) + if total >= min_chars: + return True + return total >= min_chars + + +def parse_local(path, output_dir=None) -> str: + """Parse a PDF to Markdown fully offline. Returns the Markdown string.""" + try: + import pymupdf4llm + except ImportError as exc: + raise LocalEngineError(_HINT) from exc + if output_dir is not None: + images = Path(output_dir) / "images" + images.mkdir(parents=True, exist_ok=True) + return pymupdf4llm.to_markdown(str(path), write_images=True, image_path=str(images)) + return pymupdf4llm.to_markdown(str(path)) diff --git a/skills/developing/mineru/scripts/mineru.py b/skills/developing/mineru/scripts/mineru.py new file mode 100644 index 0000000..0f8bdd4 --- /dev/null +++ b/skills/developing/mineru/scripts/mineru.py @@ -0,0 +1,1996 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.8" +# dependencies = [] +# /// +"""MinerU CLI — parse PDF / Office / image files into clean Markdown. + +Zero-dependency (Python standard library only) and AI-Native. 
The tool picks the +right MinerU backend automatically: + + * no token -> Agent API (free, no login; <=10 MB, <=20 pages) + * token + small file -> Agent API (fast & free, auto-escalates on limits) + * token + big/batch/fmt -> Standard v4 (<=200 MB, <=200 pages, docx/html/latex) + +Token: https://mineru.net/apiManage/token +Docs: https://mineru.net/apiManage/docs + +Examples +-------- + # Zero-config single file (no token needed) + python3 mineru.py paper.pdf + + # Pipe the Markdown straight back to an agent + python3 mineru.py paper.pdf --stdout + + # Batch a directory with a token (Standard API, parallel) + export MINERU_TOKEN=... + python3 mineru.py ./pdfs/ --output ./out/ --workers 8 --resume + + # Parse a remote URL and also export DOCX + python3 mineru.py https://example.com/doc.pdf --format docx +""" + +from __future__ import annotations + +import argparse +import http.client +import json +import os +import random +import re +import sys +import tempfile +import threading +import time +import urllib.error +import urllib.parse +import urllib.request +import zipfile +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from io import BytesIO +from pathlib import Path, PurePosixPath +from typing import Optional + +__version__ = "3.3.1" + +# --------------------------------------------------------------------------- # +# Constants (kept in sync with https://mineru.net/apiManage/docs) +# --------------------------------------------------------------------------- # +STANDARD_API = "https://mineru.net/api/v4" +AGENT_API = "https://mineru.net/api/v1/agent" + +AGENT_MAX_BYTES = 10 * 1024 * 1024 # 10 MB +AGENT_MAX_PAGES = 20 +STANDARD_MAX_BYTES = 200 * 1024 * 1024 # 200 MB +STANDARD_MAX_PAGES = 200 +BATCH_MAX_FILES = 50 # per Standard batch request +FREE_DAILY_PAGES = 1000 # highest-priority quota / day + +USER_AGENT = f"MinerU-Skill/{__version__}" + +# Reliability tuning 
---------------------------------------------------------- # +# Transient HTTP statuses worth a backed-off retry (never business code != 0). +RETRY_STATUSES = {408, 425, 429, 500, 502, 503, 504} +RETRY_MAX_ATTEMPTS = 4 # total tries per request (1 + 3 retries) +RETRY_BASE_DELAY = 0.5 # seconds; doubles each attempt +RETRY_MAX_DELAY = 20.0 # backoff ceiling +DEFAULT_POLL_INTERVAL = 2.0 # seconds between status polls +POLL_INTERVAL_CAP = 15.0 # adaptive backoff ceiling while polling +DEFAULT_WORKERS = 8 # decoupled submit/poll lifts the old thread-bound ceiling +# Cap for poll/submit network calls. A single stalled request must not wedge the +# single-threaded poll loop for the whole per-parse budget — that budget lives in +# job.deadline; the per-request socket timeout is bounded here. Downloads/uploads +# (large zips) keep the full timeout. +REQUEST_TIMEOUT_CAP = 30.0 + +# Business-layer API codes that are worth a bounded retry. Authentication, +# quota, and file-limit failures are intentionally absent: those need user action +# or a different input, so retrying only burns time/quota. +RETRYABLE_API_CODES = {-10001, -60001, -60007, -60009} +FATAL_API_CODES = {"A0202", "A0211", -60005, -60006, -60017, -60018, -60019} +# Daily-quota / retry-limit codes. Once any submit hits one of these the whole run +# is doomed for the day, so trip a circuit breaker and skip the remaining submits +# instead of firing N more doomed requests (FREE_DAILY_PAGES informs the message). +QUOTA_EXHAUSTED_CODES = {-60017, -60018, -60019} + +# Input modalities MinerU understands, grouped so the CLI can report what it sees +# and so support stays single-sourced. The Agent API additionally rejects HTML. 
+MODALITY_SUFFIXES = { + "pdf": {".pdf"}, + "image": {".png", ".jpg", ".jpeg", ".jp2", ".webp", ".gif", ".bmp"}, + "word": {".doc", ".docx"}, + "slides": {".ppt", ".pptx"}, + "sheet": {".xls", ".xlsx"}, + "html": {".html"}, +} +SUPPORTED_SUFFIXES = {suf for group in MODALITY_SUFFIXES.values() for suf in group} + +# Error code -> actionable hint. Mirrors the official docs error tables. +ERROR_HINTS = { + "A0202": "Invalid token — check it or create a new one at https://mineru.net/apiManage/token", + "A0211": "Token expired — create a new one at https://mineru.net/apiManage/token", + -500: "Parameter error — check request parameters and Content-Type", + -10001: "Service error — please retry later", + -10002: "Invalid request parameters", + -60001: "Failed to generate upload URL — retry later", + -60002: "Unsupported file format — use a correct file extension", + -60003: "Failed to read file — the file may be corrupted", + -60004: "Empty file — upload a valid file", + -60005: "File too large — Standard API max is 200 MB", + -60006: "Too many pages — Standard API max is 200 pages, split the file", + -60007: "Model service temporarily unavailable — retry later", + -60008: "File read timeout — ensure the URL is reachable", + -60009: "Task queue is full — retry later", + -60010: "Parse failed — retry later", + -60011: "Failed to get a valid file — ensure the file was uploaded", + -60012: "Task not found — check the task_id", + -60013: "No permission to access this task", + -60014: "Cannot delete a running task", + -60015: "File conversion failed — try converting to PDF first", + -60016: "Format conversion failed — try another export format", + -60017: "Retry limit reached — try again after a model upgrade", + -60018: "Daily parse quota reached — try again tomorrow", + -60019: "Insufficient HTML parse quota — try again tomorrow", + -60020: "File split failed — retry later", + -60021: "Failed to read page count — retry later", + -60022: "Web page read failed — possibly 
rate-limited, retry later", + # Agent (lightweight) API specific codes + -30001: "File exceeds Agent API 10 MB limit — set MINERU_TOKEN to use the Standard API", + -30002: "Agent API does not support this file type — use PDF/image/Doc/PPT/Excel", + -30003: "Pages exceed Agent API 20-page limit — set MINERU_TOKEN or pass --pages", + -30004: "Invalid request parameters — check required fields", +} + +# Agent-API error codes that a Standard-API retry can recover from. +AGENT_ESCALATABLE = {-30001, -30003} + +# Terminal/transient task states (Standard + Agent share most of these). +STATE_DONE = "done" +STATE_FAILED = "failed" +ACTIVE_STATES = {"pending", "running", "converting", "uploading", "waiting-file"} + + +class MinerUError(Exception): + """Raised when the API returns a non-zero ``code`` or an unrecoverable error.""" + + def __init__(self, message: str, code=None): + super().__init__(message) + self.code = code + + +# --------------------------------------------------------------------------- # +# Options / results +# --------------------------------------------------------------------------- # +@dataclass(frozen=True) +class ParseOptions: + model: str = "vlm" + language: str = "ch" + is_ocr: bool = False + enable_formula: bool = True + enable_table: bool = True + page_ranges: Optional[str] = None + extra_formats: tuple = () + + +@dataclass +class ParseResult: + name: str + source: str + api: str = "agent" + modality: str = "unknown" + state: str = STATE_FAILED + output_dir: Optional[str] = None + markdown_path: Optional[str] = None + markdown: Optional[str] = None + task_id: Optional[str] = None + elapsed: Optional[float] = None + error: Optional[str] = None + sinks: list = field(default_factory=list) + chunks: Optional[list] = None + + def to_status(self) -> dict: + """Machine-readable status used by ``--json`` (omits the full markdown body).""" + status = { + "name": self.name, + "source": self.source, + "api": self.api, + "modality": self.modality, + "state": 
self.state, + "output_dir": self.output_dir, + "markdown_path": self.markdown_path, + "task_id": self.task_id, + "elapsed": self.elapsed, + "error": self.error, + "sinks": self.sinks, + } + if self.chunks is not None: + status["chunks"] = self.chunks + return status + + +# --------------------------------------------------------------------------- # +# Pure helpers (heavily unit-tested) +# --------------------------------------------------------------------------- # +def is_url(value: str) -> bool: + return value.startswith("http://") or value.startswith("https://") + + +def safe_stem(source: str) -> str: + """Derive a clean output folder name from a file path or URL.""" + tail = source.split("?", 1)[0].rstrip("/") + name = tail.rsplit("/", 1)[-1] if is_url(source) else Path(source).name + stem = Path(name).stem or "document" + return stem + + +def unique_out_stems(sources) -> list: + """On-disk folder/file stem per source, disambiguated ONLY on collision. + + Distinct basenames keep their bare stem (the documented output-dir contract). + When two inputs share a basename (``a/report.pdf`` + ``b/report.pdf``) the + later ones get a ``-2``/``-3`` suffix so neither silently overwrites the other + on disk or via ``--resume``. Stable for a given input order, so resume keeps + matching across re-runs. 
+ """ + out, seen = [], {} + for src in sources: + base = safe_stem(src) + n = seen.get(base, 0) + seen[base] = n + 1 + out.append(base if n == 0 else f"{base}-{n + 1}") + return out + + +def safe_data_id(stem: str) -> str: + """data_id allows [A-Za-z0-9_.-], <=128 chars.""" + cleaned = "".join(c if (c.isalnum() or c in "_.-") else "-" for c in stem) + return cleaned[:128] or "document" + + +def suffix_of(source: str) -> str: + tail = source.split("?", 1)[0] + return Path(tail).suffix.lower() + + +def is_supported(source: str) -> bool: + return suffix_of(source) in SUPPORTED_SUFFIXES + + +def is_html(source: str) -> bool: + return suffix_of(source) == ".html" + + +def detect_modality(source: str) -> str: + """Classify the input modality (pdf/image/word/slides/sheet/html/url/unknown).""" + suffix = suffix_of(source) + if not suffix and is_url(source): + return "url" + for modality, suffixes in MODALITY_SUFFIXES.items(): + if suffix in suffixes: + return modality + return "unknown" + + +def to_agent_page_range(page_ranges: Optional[str]) -> Optional[str]: + """Agent API only supports ``from-to`` or a single page (no commas).""" + if not page_ranges: + return None + first = page_ranges.split(",", 1)[0].strip() + return first or None + + +def error_hint(code) -> str: + """Human-friendly hint for an API error code (falls back to the raw code).""" + if code in ERROR_HINTS: + return ERROR_HINTS[code] + return f"API error (code {code})" + + +def _result_error(data: dict, default: str = "Parse failed") -> str: + """Best available parse-task failure message, preserving documented codes.""" + msg = data.get("err_msg") or data.get("msg") + code = data.get("err_code") or data.get("code") + if msg: + return msg + if code is not None: + return error_hint(code) + return default + + +def choose_api( + *, + token: Optional[str], + source: str, + size_bytes: Optional[int], + batch: bool, + extra_formats, + explicit: str = "auto", +) -> str: + """Decide which backend to use. 
``explicit`` of 'agent'/'standard' wins.""" + if explicit in ("agent", "standard"): + return explicit + # HTML is Standard-only (MinerU-HTML model); Agent API rejects it. + if is_html(source): + return "standard" + if not token: + return "agent" + if batch or extra_formats: + return "standard" + if size_bytes is not None and size_bytes > AGENT_MAX_BYTES: + return "standard" + return "agent" + + +def _pdf_page_count_if_available(source: str) -> Optional[int]: + """Best-effort local PDF page count. + + The core stays zero-dependency, so this only runs when the optional ``pypdf`` + module is installed. If it is unavailable or cannot read the PDF, callers + fall back to the API-side validation. + """ + if is_url(source) or suffix_of(source) != ".pdf": + return None + try: + import pypdf # type: ignore + except ImportError: + return None + try: + return len(pypdf.PdfReader(str(source)).pages) + except Exception: + return None + + +def _precheck_limits(source: str, api_kind: str, opts: ParseOptions) -> Optional[str]: + """Return a local limit error, or ``None`` when submission is allowed.""" + if is_url(source): + return None + try: + size = os.path.getsize(source) + except OSError as exc: + return str(exc) + if size <= 0: + return "Empty file — upload a valid file" + if api_kind == "standard" and size > STANDARD_MAX_BYTES: + return ERROR_HINTS[-60005] + if api_kind == "agent" and size > AGENT_MAX_BYTES: + return ERROR_HINTS[-30001] + + # A page range may intentionally select a capped subset, so avoid rejecting + # locally unless we know the full document must be sent. 
+ if opts.page_ranges: + return None + pages = _pdf_page_count_if_available(source) + if pages is None: + return None + if api_kind == "standard" and pages > STANDARD_MAX_PAGES: + return ERROR_HINTS[-60006] + if api_kind == "agent" and pages > AGENT_MAX_PAGES: + return ERROR_HINTS[-30003] + return None + + +def _agent_page_limit_exceeded(source: str, opts: ParseOptions) -> bool: + if opts.page_ranges: + return False + pages = _pdf_page_count_if_available(source) + return pages is not None and pages > AGENT_MAX_PAGES + + +# --------------------------------------------------------------------------- # +# HTTP seam +# --------------------------------------------------------------------------- # +# Two layers: +# * ``_send_once`` — one keep-alive request (reuses a per-thread connection so +# the frequent poll traffic stops re-doing the TLS handshake). +# * ``_http`` — retry/backoff wrapper around ``_send_once``. This is the +# single place the unit tests monkeypatch, so the retry and +# keep-alive machinery is transparent to them; the retry path +# is exercised separately by patching ``_send_once``. +_conn_local = threading.local() + + +def _backoff_delay(attempt: int, retry_after=None) -> float: + """Exponential backoff with jitter; honors a server ``Retry-After`` if given.""" + if retry_after is not None: + try: + return min(float(retry_after), RETRY_MAX_DELAY) + except (TypeError, ValueError): + pass + ceiling = min(RETRY_MAX_DELAY, RETRY_BASE_DELAY * (2 ** attempt)) + return ceiling * (0.5 + random.random() / 2) # jitter in [0.5x, 1.0x] + + +def _next_poll_interval(interval, *, progressed, base) -> float: + """Adaptive poll backoff for ONE group: reset to ``base`` on progress, else + grow geometrically toward ``POLL_INTERVAL_CAP``. 
Kept per-group so a fast batch + cannot reset (or a stuck batch inflate) the polling cadence of an unrelated one.""" + if progressed: + return base + return min(interval * 1.5, POLL_INTERVAL_CAP) + + +def _should_retry_status(status) -> bool: + return status in RETRY_STATUSES + + +def _is_retryable_api_error(exc: MinerUError) -> bool: + if exc.code in FATAL_API_CODES: + return False + return exc.code == 429 or exc.code in RETRYABLE_API_CODES + + +def _conn_pool() -> dict: + pool = getattr(_conn_local, "pool", None) + if pool is None: + pool = {} + _conn_local.pool = pool + return pool + + +def _drop_conn(key) -> None: + conn = _conn_pool().pop(key, None) + if conn is not None: + try: + conn.close() + except OSError: + pass + + +def _get_conn(scheme, host, port, timeout): + key = (scheme, host, port) + pool = _conn_pool() + conn = pool.get(key) + if conn is None: + if scheme == "https": + conn = http.client.HTTPSConnection(host, port or 443, timeout=timeout) + else: + conn = http.client.HTTPConnection(host, port or 80, timeout=timeout) + pool[key] = conn + else: + conn.timeout = timeout + return conn, key + + +def _content_length(data): + try: + return len(data) + except (TypeError, AttributeError): + return None + + +def _send_once(method, url, *, headers=None, data=None, timeout=60, _redirects=5): + """One HTTP request over a reused keep-alive connection. + + Returns ``(status_code, body_bytes, retry_after)``. Raises ``urllib.error.URLError`` + on a network-level failure (so the retry layer can back off). 
+ """ + parts = urllib.parse.urlsplit(url) + scheme, host, port = parts.scheme, parts.hostname, parts.port + path = urllib.parse.urlunsplit(("", "", parts.path or "/", parts.query, "")) or "/" + send_headers = dict(headers or {}) + send_headers.setdefault("User-Agent", USER_AGENT) + if data is not None and "Content-Length" not in send_headers: + length = _content_length(data) + if length is not None: + send_headers["Content-Length"] = str(length) + + # Try once on the pooled (possibly stale) connection; on a connection-level + # error reconnect once before surfacing it to the retry layer. + for stale_attempt in (0, 1): + conn, key = _get_conn(scheme, host, port, timeout) + try: + conn.request(method, path, body=data, headers=send_headers) + resp = conn.getresponse() + status = resp.status + body = resp.read() + if resp.getheader("Connection", "").lower() == "close" or resp.version == 10: + _drop_conn(key) + if status in (301, 302, 303, 307, 308) and _redirects > 0: + location = resp.getheader("Location") + if location: + nxt = urllib.parse.urljoin(url, location) + nmethod = "GET" if status in (301, 302, 303) and method != "HEAD" else method + ndata = None if nmethod != method else data + if ndata is not None and hasattr(ndata, "seek"): + try: + ndata.seek(0) + except OSError: + pass + return _send_once(nmethod, nxt, headers=headers, data=ndata, + timeout=timeout, _redirects=_redirects - 1) + return status, body, resp.getheader("Retry-After") + except (http.client.HTTPException, ConnectionError, OSError) as exc: + _drop_conn(key) + if stale_attempt == 0: + if hasattr(data, "seek"): + try: + data.seek(0) + except OSError: + pass + continue # pooled connection was stale — reconnect and retry once + raise urllib.error.URLError(exc) + raise urllib.error.URLError("connection failed") # pragma: no cover - defensive + + +def _http(method, url, *, headers=None, data=None, timeout=60): + """Perform one HTTP request with bounded exponential backoff on transient + failures 
(429/5xx/network). Returns ``(status_code, body_bytes)``. + + Business errors (HTTP 200 with ``code != 0``) are *not* retried here — that is + the caller's concern in :func:`_api_json`. + """ + for attempt in range(RETRY_MAX_ATTEMPTS): + if hasattr(data, "seek"): + try: + data.seek(0) + except OSError: + pass + status = None + retry_after = None + last_exc = None + try: + status, body, retry_after = _send_once( + method, url, headers=headers, data=data, timeout=timeout + ) + except urllib.error.URLError as exc: + last_exc = exc + if status is not None and not _should_retry_status(status): + return status, body + if attempt + 1 < RETRY_MAX_ATTEMPTS: + time.sleep(_backoff_delay(attempt, retry_after if status is not None else None)) + continue + # Retries exhausted. + if status == 429: + raise MinerUError( + "Rate limited (HTTP 429) — slow down, lower --workers, or set a token", + code=429, + ) + if status is not None: + return status, body # surface the last 5xx body so _api_json can report it + raise MinerUError(f"Network error after {RETRY_MAX_ATTEMPTS} attempts: {last_exc}") + + +def _api_json(method, url, *, token=None, payload=None, timeout=60) -> dict: + """Call a MinerU JSON endpoint and return ``data``, raising on ``code != 0``.""" + headers = {"Accept": "*/*"} + body = None + if payload is not None: + headers["Content-Type"] = "application/json" + body = json.dumps(payload).encode("utf-8") + if token: + headers["Authorization"] = f"Bearer {token}" + for attempt in range(RETRY_MAX_ATTEMPTS): + status, raw = _http(method, url, headers=headers, data=body, timeout=timeout) + try: + parsed = json.loads(raw.decode("utf-8")) + except (ValueError, UnicodeDecodeError): + raise MinerUError(f"Non-JSON response (HTTP {status}) from {url}") + # MinerU returns two envelopes: the business layer uses {code, data, msg} + # while the auth/gateway layer uses {success, msgCode, msg} (e.g. on a bad + # token). Handle both so credential errors surface clearly. 
+ if parsed.get("success") is False: + code = parsed.get("msgCode") or parsed.get("code") + hint = ERROR_HINTS.get(code) or parsed.get("msg") or error_hint(code) + raise MinerUError(hint, code=code) + code = parsed.get("code") + if not (200 <= status < 300) and code in (0, None): + raise MinerUError(f"HTTP {status} from {url}", code=status) + if code not in (0, None): + if code in RETRYABLE_API_CODES and attempt + 1 < RETRY_MAX_ATTEMPTS: + time.sleep(_backoff_delay(attempt)) + continue + raise MinerUError(error_hint(code), code=code) + data = parsed.get("data") + if data is None and code is None and "success" not in parsed: + raise MinerUError(f"Unexpected response (HTTP {status}) from {url}") + return data or {} + raise MinerUError(f"API retry exhausted for {url}") # pragma: no cover - defensive + + +def _put_file(upload_url: str, path: str, timeout=300) -> None: + """Upload a local file to a signed OSS URL (no Content-Type per docs).""" + headers = {"Content-Length": str(os.path.getsize(path))} + with open(path, "rb") as handle: + status, _ = _http("PUT", upload_url, headers=headers, data=handle, timeout=timeout) + if status not in (200, 201, 203): + raise MinerUError(f"Upload failed (HTTP {status})") + + +def _download(url: str, timeout=300) -> bytes: + status, raw = _http("GET", url, timeout=timeout) + if status != 200: + raise MinerUError(f"Download failed (HTTP {status})") + return raw + + +def _download_to_path(url: str, dest: Path, *, timeout=300) -> Path: + """Stream a (potentially large) download straight to disk in chunks. + + Used for result zips so a batch worker never buffers the whole archive in RAM. + Retries transient failures with backoff, mirroring :func:`_http`. 
+ """ + for attempt in range(RETRY_MAX_ATTEMPTS): + try: + req = urllib.request.Request(url, method="GET", headers={"User-Agent": USER_AGENT}) + with urllib.request.urlopen(req, timeout=timeout) as resp: + if resp.getcode() != 200: + raise MinerUError(f"Download failed (HTTP {resp.getcode()})") + with open(dest, "wb") as handle: + while True: + chunk = resp.read(65536) + if not chunk: + break + handle.write(chunk) + return dest + except urllib.error.HTTPError as exc: + if _should_retry_status(exc.code) and attempt + 1 < RETRY_MAX_ATTEMPTS: + time.sleep(_backoff_delay(attempt, exc.headers.get("Retry-After"))) + continue + raise MinerUError(f"Download failed (HTTP {exc.code})", code=exc.code) + except urllib.error.URLError as exc: + if attempt + 1 < RETRY_MAX_ATTEMPTS: + time.sleep(_backoff_delay(attempt)) + continue + raise MinerUError(f"Download failed: {exc}") + raise MinerUError("Download failed after retries") # pragma: no cover - defensive + + +# --------------------------------------------------------------------------- # +# Agent API (lightweight, no token) +# --------------------------------------------------------------------------- # +def _agent_payload(opts: ParseOptions) -> dict: + payload = { + "language": opts.language, + "enable_table": opts.enable_table, + "is_ocr": opts.is_ocr, + "enable_formula": opts.enable_formula, + } + page_range = to_agent_page_range(opts.page_ranges) + if page_range: + payload["page_range"] = page_range + return payload + + +def agent_parse(source: str, opts: ParseOptions, *, poll_interval=3, timeout=600): + """Parse one URL or file via the Agent API. 
Returns the Markdown text.""" + if is_url(source): + payload = {"url": source, **_agent_payload(opts)} + data = _api_json("POST", f"{AGENT_API}/parse/url", payload=payload) + task_id = data["task_id"] + else: + payload = {"file_name": Path(source).name, **_agent_payload(opts)} + data = _api_json("POST", f"{AGENT_API}/parse/file", payload=payload) + task_id = data["task_id"] + _put_file(data["file_url"], source, timeout=timeout) + markdown = _agent_poll(task_id, poll_interval=poll_interval, timeout=timeout) + return markdown, task_id + + +def _agent_poll(task_id, *, poll_interval, timeout) -> str: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + data = _api_json("GET", f"{AGENT_API}/parse/{task_id}", timeout=timeout) + state = data.get("state") + if state == STATE_DONE: + return _download(data["markdown_url"], timeout=timeout).decode("utf-8", errors="replace") + if state == STATE_FAILED: + raise MinerUError( + data.get("err_msg") or error_hint(data.get("err_code")), + code=data.get("err_code"), + ) + time.sleep(poll_interval) + raise MinerUError("Agent parse timed out") + + +# --------------------------------------------------------------------------- # +# Standard API (v4, token required) +# --------------------------------------------------------------------------- # +def _standard_model(opts: ParseOptions, source: str) -> str: + return "MinerU-HTML" if is_html(source) else opts.model + + +def _standard_submit_and_poll(source, opts, token, *, poll_interval=3, timeout=600): + """Submit one URL/file to the Standard API and poll to completion. + + Returns ``(full_zip_url, task_or_batch_id)`` — the caller decides whether to + buffer (small) or stream (large) the result zip. The submit POST uses a capped + socket timeout; the full ``timeout`` stays the parse budget for polling. 
+ """ + model = _standard_model(opts, source) + req_timeout = min(timeout, REQUEST_TIMEOUT_CAP) + if is_url(source): + payload = { + "url": source, + "model_version": model, + "is_ocr": opts.is_ocr, + "enable_formula": opts.enable_formula, + "enable_table": opts.enable_table, + "language": opts.language, + } + if opts.page_ranges: + payload["page_ranges"] = opts.page_ranges + if opts.extra_formats: + payload["extra_formats"] = list(opts.extra_formats) + data = _api_json("POST", f"{STANDARD_API}/extract/task", token=token, + payload=payload, timeout=req_timeout) + zip_url = _standard_poll_task(data["task_id"], token, poll_interval=poll_interval, timeout=timeout) + return zip_url, data["task_id"] + + # Local file: request a signed upload URL, PUT the bytes, then poll the batch. + file_entry = {"name": Path(source).name, "data_id": safe_data_id(safe_stem(source))} + if opts.is_ocr: + file_entry["is_ocr"] = True + if opts.page_ranges: + file_entry["page_ranges"] = opts.page_ranges + payload = { + "files": [file_entry], + "model_version": model, + "enable_formula": opts.enable_formula, + "enable_table": opts.enable_table, + "language": opts.language, + } + if opts.extra_formats: + payload["extra_formats"] = list(opts.extra_formats) + data = _api_json("POST", f"{STANDARD_API}/file-urls/batch", token=token, + payload=payload, timeout=req_timeout) + batch_id = data["batch_id"] + _put_file(data["file_urls"][0], source, timeout=timeout) + zip_url = _standard_poll_batch(batch_id, token, Path(source).name, poll_interval=poll_interval, timeout=timeout) + return zip_url, batch_id + + +def standard_parse( + source: str, opts: ParseOptions, token: str, *, poll_interval=3, timeout=600 +): + """Parse one URL or file via the Standard API. 
Returns ``(zip_bytes, id)``.""" + zip_url, task_id = _standard_submit_and_poll( + source, opts, token, poll_interval=poll_interval, timeout=timeout + ) + return _download(zip_url, timeout=timeout), task_id + + +def _standard_parse_to_dir(source, opts, token, out_stem, output_dir, *, poll_interval=3, timeout=600): + """Standard parse of a single input, streaming the result zip to disk rather + than buffering the whole archive in RAM. Returns ``(md_path, id)``.""" + zip_url, task_id = _standard_submit_and_poll( + source, opts, token, poll_interval=poll_interval, timeout=timeout + ) + target_dir = output_dir / out_stem + target_dir.mkdir(parents=True, exist_ok=True) + zip_path = target_dir / "._result.zip.partial" + _download_to_path(zip_url, zip_path, timeout=timeout) + md_path = extract_zip_path(out_stem, zip_path, output_dir) + try: + zip_path.unlink() + except OSError: + pass + return md_path, task_id + + +def _standard_poll_task(task_id, token, *, poll_interval, timeout) -> str: + deadline = time.monotonic() + timeout + req_timeout = min(timeout, REQUEST_TIMEOUT_CAP) + while time.monotonic() < deadline: + data = _api_json("GET", f"{STANDARD_API}/extract/task/{task_id}", token=token, + timeout=req_timeout) + state = data.get("state") + if state == STATE_DONE: + return data["full_zip_url"] + if state == STATE_FAILED: + raise MinerUError(_result_error(data), code=data.get("err_code")) + time.sleep(poll_interval) + raise MinerUError("Standard parse timed out") + + +def _standard_poll_batch(batch_id, token, file_name, *, poll_interval, timeout) -> str: + deadline = time.monotonic() + timeout + req_timeout = min(timeout, REQUEST_TIMEOUT_CAP) + while time.monotonic() < deadline: + data = _api_json("GET", f"{STANDARD_API}/extract-results/batch/{batch_id}", + token=token, timeout=req_timeout) + for entry in data.get("extract_result", []): + if entry.get("file_name") != file_name: + continue + state = entry.get("state") + if state == STATE_DONE: + return 
entry["full_zip_url"] + if state == STATE_FAILED: + raise MinerUError(_result_error(entry), code=entry.get("err_code")) + time.sleep(poll_interval) + raise MinerUError("Standard parse timed out") + + +# --------------------------------------------------------------------------- # +# Output writing +# --------------------------------------------------------------------------- # +def write_markdown(stem: str, markdown: str, output_dir: Path) -> Path: + """Write a bare Markdown string (Agent API result) to ``//.md``. + + The write is atomic (temp file + ``os.replace``) so an interrupted run can never + leave a half-written ``.md`` that ``--resume`` would mistake for a finished file. + """ + target_dir = output_dir / stem + target_dir.mkdir(parents=True, exist_ok=True) + md_path = target_dir / f"{stem}.md" + tmp_path = target_dir / f".{stem}.md.partial" + tmp_path.write_text(markdown, encoding="utf-8") + os.replace(tmp_path, md_path) + return md_path + + +def _finalize_zip_dir(target_dir: Path, stem: str) -> Path: + """Rename the archive's ``full.md`` to ``.md``. + + With no ``full.md`` (and no already-correct ``.md``), pick the largest + ``*.md`` — the most likely full body — tie-broken by name, so the choice is + deterministic instead of filesystem ``glob`` order. + """ + full_md = target_dir / "full.md" + md_path = target_dir / f"{stem}.md" + if full_md.exists(): + full_md.replace(md_path) + elif not md_path.exists(): + candidates = sorted(target_dir.glob("*.md"), key=lambda p: (-p.stat().st_size, p.name)) + if candidates: + candidates[0].replace(md_path) + return md_path + + +def _validate_zip_member(info: zipfile.ZipInfo) -> None: + name = info.filename + parts = PurePosixPath(name).parts + if PurePosixPath(name).is_absolute() or ".." 
in parts: + raise MinerUError(f"Unsafe zip member path: {name}") + file_type = (info.external_attr >> 16) & 0o170000 + if file_type == 0o120000: + raise MinerUError(f"Unsafe zip symlink: {name}") + + +def _safe_extract_zip(archive: zipfile.ZipFile, target_dir: Path) -> None: + for info in archive.infolist(): + _validate_zip_member(info) + archive.extractall(target_dir) + + +def write_zip(stem: str, zip_bytes: bytes, output_dir: Path) -> Path: + """Extract a Standard API result zip and return the path to the renamed Markdown.""" + target_dir = output_dir / stem + target_dir.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(BytesIO(zip_bytes)) as archive: + _safe_extract_zip(archive, target_dir) + return _finalize_zip_dir(target_dir, stem) + + +def extract_zip_path(stem: str, zip_path: Path, output_dir: Path) -> Path: + """Extract a result zip already on disk (streamed download) without buffering it.""" + target_dir = output_dir / stem + target_dir.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(zip_path) as archive: + _safe_extract_zip(archive, target_dir) + return _finalize_zip_dir(target_dir, stem) + + +def copy_to_obsidian(md_path: Path, stem: str, vault: Path) -> Path: + """Copy the parsed Markdown (and sibling images) into an Obsidian vault folder.""" + vault.mkdir(parents=True, exist_ok=True) + dest = vault / f"{stem}.md" + dest.write_text(md_path.read_text(encoding="utf-8"), encoding="utf-8") + images = md_path.parent / "images" + if images.is_dir(): + dest_images = vault / "images" + dest_images.mkdir(exist_ok=True) + for img in images.iterdir(): + if img.is_file(): + (dest_images / img.name).write_bytes(img.read_bytes()) + return dest + + +# --------------------------------------------------------------------------- # +# Per-input orchestration +# --------------------------------------------------------------------------- # +def process_one( + source: str, + opts: ParseOptions, + *, + token: Optional[str], + output_dir: Path, + api: str = 
"auto", + obsidian: Optional[Path] = None, + resume: bool = False, + poll_interval: float = 3, + timeout: float = 600, + engine: str = "cloud", + out_stem: Optional[str] = None, +) -> ParseResult: + """Parse a single input end to end, choosing the backend and writing output.""" + stem = safe_stem(source) + out_stem = out_stem or stem # on-disk identity (disambiguated by the caller on collision) + result = ParseResult(name=stem, source=source, modality=detect_modality(source)) + started = time.monotonic() + + if resume and (output_dir / out_stem / f"{out_stem}.md").exists(): + result.state = "skipped" + result.output_dir = str(output_dir / out_stem) + result.markdown_path = str(output_dir / out_stem / f"{out_stem}.md") + return result + + if engine in ("local", "auto") and not is_url(source) and suffix_of(source) == ".pdf": + local = _load_local_engine() + use_local = engine == "local" + if engine == "auto" and local is not None: + try: + use_local = local.available() and local.is_born_digital(source) + except Exception: + use_local = False + if use_local and local is not None: + try: + markdown = local.parse_local(source) + md_path = write_markdown(out_stem, markdown, output_dir) + result.api = "local" + result.markdown = markdown + return _finalize(result, out_stem, output_dir, md_path, obsidian, started) + except Exception as exc: # LocalEngineError or parse failure + if engine == "local": + result.state = STATE_FAILED + result.error = str(exc) + return result + # auto: fall through to the cloud engine + + size_bytes = None + if not is_url(source): + try: + size_bytes = os.path.getsize(source) + except OSError: + pass + + chosen = choose_api( + token=token, + source=source, + size_bytes=size_bytes, + batch=False, + extra_formats=opts.extra_formats, + explicit=api, + ) + if chosen == "agent" and api == "auto" and token and _agent_page_limit_exceeded(source, opts): + chosen = "standard" + + precheck_error = _precheck_limits(source, chosen, opts) + if 
precheck_error: + result.api = chosen + result.state = STATE_FAILED + result.error = precheck_error + return result + + try: + result.api = chosen + if chosen == "standard": + if not token: + raise MinerUError( + "Standard API needs a token — set MINERU_TOKEN " + "(https://mineru.net/apiManage/token)" + ) + md_path, task_id = _standard_parse_to_dir( + source, opts, token, out_stem, output_dir, + poll_interval=poll_interval, timeout=timeout + ) + result.task_id = task_id + result.markdown = md_path.read_text(encoding="utf-8") if md_path.exists() else "" + else: + try: + markdown, task_id = agent_parse( + source, opts, poll_interval=poll_interval, timeout=timeout + ) + except MinerUError as exc: + # Auto-escalate to the Standard API when a token is available. + if api == "auto" and token and exc.code in AGENT_ESCALATABLE: + result.api = "standard" + md_path, task_id = _standard_parse_to_dir( + source, opts, token, out_stem, output_dir, + poll_interval=poll_interval, timeout=timeout + ) + result.task_id = task_id + result.markdown = md_path.read_text(encoding="utf-8") if md_path.exists() else "" + return _finalize(result, out_stem, output_dir, md_path, obsidian, started) + raise + result.task_id = task_id + md_path = write_markdown(out_stem, markdown, output_dir) + result.markdown = markdown + return _finalize(result, out_stem, output_dir, md_path, obsidian, started) + except MinerUError as exc: + result.state = STATE_FAILED + result.error = str(exc) + return result + except (OSError, urllib.error.URLError) as exc: + result.state = STATE_FAILED + result.error = str(exc) + return result + except Exception as exc: # noqa: BLE001 - one malformed response must not abort the batch + result.state = STATE_FAILED + result.error = f"{type(exc).__name__}: {exc}" + return result + + +def _finalize(result, out_stem, output_dir, md_path, obsidian, started=None) -> ParseResult: + # A result that yielded no markdown on disk is a failure, not a silent "done" + # pointing at a 
nonexistent/empty body. + if md_path is None or not Path(md_path).exists(): + result.state = STATE_FAILED + result.error = result.error or "no markdown in result" + return result + result.state = STATE_DONE + result.output_dir = str(output_dir / out_stem) + result.markdown_path = str(md_path) + if started is not None: + result.elapsed = round(time.monotonic() - started, 2) + if obsidian is not None: + copy_to_obsidian(md_path, out_stem, obsidian) + return result + + +# --------------------------------------------------------------------------- # +# Multi-input pipeline: decoupled submit -> poll -> download +# --------------------------------------------------------------------------- # +# The single-file primitives above each block a thread for the whole parse +# (submit, then ``time.sleep`` poll loop). That caps real concurrency at the +# worker count and wastes threads sleeping. For batches we instead: +# 1. submit every input up front (Standard local files coalesced into one +# ``/file-urls/batch`` call of up to 50 files — the endpoint's real purpose), +# 2. poll all outstanding tickets from one place with adaptive backoff (a single +# batch poll returns the state of every file in that batch), and +# 3. download + extract completed results in a small pool as soon as each is done. +# Nothing sleeps holding a parse slot, and Standard API/poll traffic collapses ~Nx. 
+@dataclass +class _Job: + source: str + stem: str + api: str + is_url: bool + result: ParseResult + out_stem: str = "" + data_id: str = "" + file_name: str = "" + poll_kind: str = "" # "agent" | "task" | "batch" + poll_id: str = "" + download_url: str = "" + download_kind: str = "" # "md" | "zip" + deadline: float = 0.0 + started: float = 0.0 + finished: bool = False # resolved during planning/submit (skip or hard-fail) + + +def _plan_jobs(sources, opts, *, token, output_dir, api, resume, batch_mode=False) -> list: + """Resolve each source's backend and short-circuit already-parsed inputs.""" + jobs = [] + out_stems = unique_out_stems(sources) + for idx, src in enumerate(sources): + stem = safe_stem(src) + out_stem = out_stems[idx] + res = ParseResult(name=stem, source=src, modality=detect_modality(src)) + url = is_url(src) + job = _Job( + source=src, stem=stem, out_stem=out_stem, api="agent", is_url=url, result=res, + file_name=(safe_stem(src) if url else Path(src).name), + data_id=f"{safe_data_id(stem)}-{idx}", + ) + if resume and (output_dir / out_stem / f"{out_stem}.md").exists(): + res.state = "skipped" + res.output_dir = str(output_dir / out_stem) + res.markdown_path = str(output_dir / out_stem / f"{out_stem}.md") + job.finished = True + jobs.append(job) + continue + size_bytes = None + if not url: + try: + size_bytes = os.path.getsize(src) + except OSError: + pass + job.api = choose_api( + token=token, source=src, size_bytes=size_bytes, batch=batch_mode, + extra_formats=opts.extra_formats, explicit=api, + ) + if job.api == "agent" and api == "auto" and token and _agent_page_limit_exceeded(src, opts): + job.api = "standard" + res.api = job.api + precheck_error = _precheck_limits(src, job.api, opts) + if precheck_error: + res.state = STATE_FAILED + res.error = precheck_error + job.finished = True + jobs.append(job) + return jobs + + +def _reserve_single(job, opts, token, timeout) -> list: + """Submit one non-batch input; return upload work ``[(job, url)]`` 
(empty for URLs).""" + src = job.source + job.deadline = time.monotonic() + timeout + req_timeout = min(timeout, REQUEST_TIMEOUT_CAP) + if job.api == "agent": + if job.is_url: + data = _api_json("POST", f"{AGENT_API}/parse/url", + payload={"url": src, **_agent_payload(opts)}, timeout=req_timeout) + job.poll_kind, job.poll_id = "agent", data["task_id"] + job.result.task_id = data["task_id"] + return [] + data = _api_json("POST", f"{AGENT_API}/parse/file", + payload={"file_name": Path(src).name, **_agent_payload(opts)}, + timeout=req_timeout) + job.poll_kind, job.poll_id = "agent", data["task_id"] + job.result.task_id = data["task_id"] + return [(job, data["file_url"])] + # Standard API URL -> single extract task (only local files coalesce into batches). + model = _standard_model(opts, src) + payload = { + "url": src, "model_version": model, "is_ocr": opts.is_ocr, + "enable_formula": opts.enable_formula, "enable_table": opts.enable_table, + "language": opts.language, + } + if opts.page_ranges: + payload["page_ranges"] = opts.page_ranges + if opts.extra_formats: + payload["extra_formats"] = list(opts.extra_formats) + data = _api_json("POST", f"{STANDARD_API}/extract/task", token=token, + payload=payload, timeout=req_timeout) + job.poll_kind, job.poll_id = "task", data["task_id"] + job.result.task_id = data["task_id"] + return [] + + +def _reserve_batch(batch_jobs, opts, token, timeout) -> list: + """Submit one ``/file-urls/batch`` of up to 50 files; return upload work list.""" + model = _standard_model(opts, batch_jobs[0].source) + files = [] + for job in batch_jobs: + entry = {"name": Path(job.source).name, "data_id": job.data_id} + if opts.is_ocr: + entry["is_ocr"] = True + if opts.page_ranges: + entry["page_ranges"] = opts.page_ranges + files.append(entry) + payload = { + "files": files, "model_version": model, + "enable_formula": opts.enable_formula, "enable_table": opts.enable_table, + "language": opts.language, + } + if opts.extra_formats: + 
payload["extra_formats"] = list(opts.extra_formats) + data = _api_json("POST", f"{STANDARD_API}/file-urls/batch", token=token, + payload=payload, timeout=min(timeout, REQUEST_TIMEOUT_CAP)) + batch_id = data["batch_id"] + urls = data["file_urls"] + if len(urls) != len(batch_jobs): + raise MinerUError( + f"Batch upload URL count mismatch: requested {len(batch_jobs)}, got {len(urls)}" + ) + deadline = time.monotonic() + timeout + uploads = [] + for job, url in zip(batch_jobs, urls): + if isinstance(url, dict): + url = url.get("url") or url.get("file_url") or url.get("upload_url") + if not url: + raise MinerUError(f"Missing upload URL for {job.file_name or job.source}") + job.poll_kind, job.poll_id = "batch", batch_id + job.result.task_id = batch_id + job.deadline = deadline + uploads.append((job, url)) + return uploads + + +def _reserve_url_batch(batch_jobs, opts, token, timeout) -> list: + """Submit one Standard ``/extract/task/batch`` for URL inputs.""" + model = _standard_model(opts, batch_jobs[0].source) + files = [{"url": job.source, "data_id": job.data_id} for job in batch_jobs] + payload = {"files": files, "model_version": model} + data = _api_json("POST", f"{STANDARD_API}/extract/task/batch", token=token, + payload=payload, timeout=min(timeout, REQUEST_TIMEOUT_CAP)) + batch_id = data["batch_id"] + deadline = time.monotonic() + timeout + for job in batch_jobs: + job.poll_kind, job.poll_id = "batch", batch_id + job.result.task_id = batch_id + job.deadline = deadline + return [] + + +def _can_batch_standard_urls(opts: ParseOptions) -> bool: + """URL batch endpoint has a narrower documented payload than single URL tasks.""" + return ( + not opts.is_ocr + and opts.enable_formula + and opts.enable_table + and opts.language == "ch" + and not opts.page_ranges + and not opts.extra_formats + ) + + +def _chunk_standard_jobs(std_jobs, opts, batch_size) -> list: + """Group Standard-API jobs into batches, splitting by model_version.""" + by_model: dict = {} + for job in 
std_jobs: + by_model.setdefault(_standard_model(opts, job.source), []).append(job) + chunks = [] + size = max(1, min(batch_size, BATCH_MAX_FILES)) + for group in by_model.values(): + for i in range(0, len(group), size): + chunks.append(group[i:i + size]) + return chunks + + +def _chunk_standard_files(std_jobs, opts, batch_size) -> list: + """Backward-compatible wrapper for tests/importers.""" + return _chunk_standard_jobs(std_jobs, opts, batch_size) + + +def _poll_group(kind, poll_id, group, token, *, timeout=60): + """Poll one ticket; return ``(completed_jobs, failed_jobs)`` (state set on each).""" + completed, failed = [], [] + if kind == "agent": + data = _api_json("GET", f"{AGENT_API}/parse/{poll_id}", timeout=timeout) + job = group[0] + state = data.get("state") + if state == STATE_DONE: + job.download_url, job.download_kind = data["markdown_url"], "md" + completed.append(job) + elif state == STATE_FAILED: + job.result.error = _result_error(data) + failed.append(job) + elif kind == "task": + data = _api_json("GET", f"{STANDARD_API}/extract/task/{poll_id}", token=token, + timeout=timeout) + job = group[0] + state = data.get("state") + if state == STATE_DONE: + job.download_url, job.download_kind = data["full_zip_url"], "zip" + completed.append(job) + elif state == STATE_FAILED: + job.result.error = _result_error(data) + failed.append(job) + else: # batch — one GET reports every file in the batch + data = _api_json("GET", f"{STANDARD_API}/extract-results/batch/{poll_id}", + token=token, timeout=timeout) + by_data, by_name = {}, {} + for entry in data.get("extract_result", []): + if entry.get("data_id"): + by_data[entry["data_id"]] = entry + if entry.get("file_name"): + by_name.setdefault(entry["file_name"], entry) + group_name_counts = {} + for job in group: + group_name_counts[job.file_name] = group_name_counts.get(job.file_name, 0) + 1 + for job in group: + entry = by_data.get(job.data_id) + if entry is None and group_name_counts.get(job.file_name) == 1: + 
entry = by_name.get(job.file_name) + if not entry: + continue + state = entry.get("state") + if state == STATE_DONE: + job.download_url, job.download_kind = entry["full_zip_url"], "zip" + completed.append(job) + elif state == STATE_FAILED: + job.result.error = _result_error(entry) + failed.append(job) + return completed, failed + + +def _download_and_write(job, opts, output_dir, *, obsidian, want_markdown, timeout=300): + """Download a completed result, write it, and finalize the job's ParseResult.""" + try: + if job.download_kind == "md": + markdown = _download(job.download_url, timeout=timeout).decode("utf-8", errors="replace") + md_path = write_markdown(job.out_stem, markdown, output_dir) + if want_markdown: + job.result.markdown = markdown + else: + target_dir = output_dir / job.out_stem + target_dir.mkdir(parents=True, exist_ok=True) + zip_path = target_dir / "._result.zip.partial" + _download_to_path(job.download_url, zip_path, timeout=timeout) + md_path = extract_zip_path(job.out_stem, zip_path, output_dir) + try: + zip_path.unlink() + except OSError: + pass + if want_markdown and md_path.exists(): + job.result.markdown = md_path.read_text(encoding="utf-8", errors="replace") + _finalize(job.result, job.out_stem, output_dir, md_path, obsidian, + started=job.started or None) + except Exception as exc: # noqa: BLE001 - isolate a single bad result + job.result.state = STATE_FAILED + job.result.error = f"{type(exc).__name__}: {exc}" + + +def _poll_until_done(active, opts, token, output_dir, *, poll_interval, obsidian, + want_markdown, download_pool, on_done, timeout=60): + """Decoupled poll loop: dispatch downloads as results complete. + + Each batch/task ticket carries its OWN adaptive backoff and next-due time, so + one batch's cadence never resets or inflates another's. 
The per-request socket + timeout is capped (a stalled poll can't wedge the loop for the whole parse + budget — that lives in ``job.deadline``), and any unexpected error in one + ticket fails only that ticket instead of aborting the whole batch. + """ + poll_req_timeout = min(timeout, REQUEST_TIMEOUT_CAP) + now0 = time.monotonic() + groups: dict = {} # poll_id -> {kind, jobs, interval, due} + for job in active: + st = groups.setdefault( + job.poll_id, {"kind": job.poll_kind, "jobs": [], "interval": poll_interval, "due": now0} + ) + st["jobs"].append(job) + dl_futures = [] + while groups: + now = time.monotonic() + due_ids = [pid for pid, st in groups.items() if st["due"] <= now] + if not due_ids: + time.sleep(max(0.0, min(st["due"] for st in groups.values()) - now)) + continue + for poll_id in due_ids: + st = groups[poll_id] + group = st["jobs"] + try: + completed, failed = _poll_group(st["kind"], poll_id, group, token, timeout=poll_req_timeout) + except MinerUError as exc: + completed = [] + if _is_retryable_api_error(exc): + failed = [] # transient poll error — try again next cycle + else: + failed = list(group) + for job in failed: + job.result.error = str(exc) + except Exception as exc: # noqa: BLE001 - a malformed ticket must not abort the batch + completed = [] + failed = list(group) + for job in failed: + job.result.error = f"poll error: {type(exc).__name__}: {exc}" + for job in group: + if job in completed or job in failed: + continue + if now > job.deadline: + job.result.state = STATE_FAILED + job.result.error = "parse timed out" + failed.append(job) + for job in completed: + group.remove(job) + dl_futures.append( + download_pool.submit( + _download_and_write, job, opts, output_dir, + obsidian=obsidian, want_markdown=want_markdown, timeout=timeout, + ) + ) + for job in failed: + group.remove(job) + on_done(job) + progressed = bool(completed or failed) + if not group: + del groups[poll_id] + else: + st["interval"] = _next_poll_interval(st["interval"], 
progressed=progressed, base=poll_interval) + st["due"] = now + st["interval"] + for future in dl_futures: + future.result() # _download_and_write swallows its own errors into the result + # Notify every resolved job after downloads finish (the guard de-dupes earlier + # poll-failure notifications); includes jobs that failed *inside* the download + # step, so none are silently left unreported. + for job in active: + on_done(job) + + +def run_pipeline(sources, opts, *, token, output_dir, api, resume, poll_interval, + timeout, batch_size, workers, obsidian=None, want_markdown=False, + on_result=None) -> list: + """Parse many inputs with decoupled submit/poll/download. Returns ParseResults.""" + batch_mode = len(sources) > 1 + jobs = _plan_jobs(sources, opts, token=token, output_dir=output_dir, + api=api, resume=resume, batch_mode=batch_mode) + started = time.monotonic() + for job in jobs: + job.started = started + + def _notify(job): + if on_result is not None and not getattr(job, "_notified", False): + job._notified = True + on_result(job.result) + + for job in jobs: + if job.finished: # resume-skipped + _notify(job) + + pending = [j for j in jobs if not j.finished] + std_files = [j for j in pending if j.api == "standard" and not j.is_url] + std_urls = [j for j in pending if j.api == "standard" and j.is_url] + batchable_urls = std_urls if _can_batch_standard_urls(opts) else [] + single_urls = [] if _can_batch_standard_urls(opts) else std_urls + single_url_ids = {id(j) for j in single_urls} + singles = [ + j for j in pending + if j.api != "standard" or (j.is_url and id(j) in single_url_ids) + ] + + if not token: + for job in pending: + if job.api == "standard": + job.result.state = STATE_FAILED + job.result.error = ("Standard API needs a token — set MINERU_TOKEN " + "(https://mineru.net/apiManage/token)") + job.finished = True + _notify(job) + std_files = [j for j in std_files if not j.finished] + batchable_urls = [j for j in batchable_urls if not j.finished] + 
single_urls = [j for j in single_urls if not j.finished] + singles = [j for j in singles if not j.finished] + + file_batches = _chunk_standard_jobs(std_files, opts, batch_size) + url_batches = _chunk_standard_jobs(batchable_urls, opts, batch_size) + + quota_tripped = threading.Event() + + def _reserve_guarded(fn, *fn_args): + # Daily quota / retry-limit is terminal for the whole run; once tripped, + # skip the remaining submits instead of firing more doomed requests. + if quota_tripped.is_set(): + raise MinerUError( + f"daily quota reached ({FREE_DAILY_PAGES} free pages/day) — submit skipped; retry tomorrow", + code=-60018, + ) + try: + return fn(*fn_args) + except MinerUError as exc: + if exc.code in QUOTA_EXHAUSTED_CODES: + quota_tripped.set() + raise + + with ThreadPoolExecutor(max_workers=workers) as submit_pool, \ + ThreadPoolExecutor(max_workers=workers) as download_pool: + # Phase 1 — reserve (parallel): one POST per single input / per batch. + reserve = {} + for job in singles: + reserve[submit_pool.submit(_reserve_guarded, _reserve_single, job, opts, token, timeout)] = ("single", job) + for chunk in file_batches: + reserve[submit_pool.submit(_reserve_guarded, _reserve_batch, chunk, opts, token, timeout)] = ("batch", chunk) + for chunk in url_batches: + reserve[submit_pool.submit(_reserve_guarded, _reserve_url_batch, chunk, opts, token, timeout)] = ("batch", chunk) + + uploads = [] + for future in as_completed(reserve): + kind, payload = reserve[future] + affected = payload if kind == "batch" else [payload] + try: + uploads.extend(future.result()) + except Exception as exc: # noqa: BLE001 - submit failure isolated per ticket + msg = str(exc) if isinstance(exc, MinerUError) else f"{type(exc).__name__}: {exc}" + for job in affected: + if job.result.state != STATE_DONE: + job.result.state = STATE_FAILED + job.result.error = f"submit failed: {msg}" + job.finished = True + _notify(job) + + # Phase 2 — upload (parallel): PUT each reserved file to its signed URL. 
+ up_futures = {} + for job, url in uploads: + up_futures[submit_pool.submit(_put_file, url, job.source, timeout)] = job + for future in as_completed(up_futures): + job = up_futures[future] + try: + future.result() + except Exception as exc: # noqa: BLE001 + job.result.state = STATE_FAILED + job.result.error = f"upload failed: {exc}" + job.finished = True + _notify(job) + + # Phase 3 — poll + download (decoupled). Gate on the ``finished`` flag, NOT + # result.state: a freshly submitted job's ParseResult still carries its default + # ``failed`` state, so a state-based filter would drop every in-flight job and + # skip polling entirely — reporting the whole batch as failed. + active = [j for j in pending if j.poll_id and not j.finished] + if active: + _poll_until_done( + active, opts, token, output_dir, poll_interval=poll_interval, + obsidian=obsidian, want_markdown=want_markdown, + download_pool=download_pool, on_done=_notify, timeout=timeout, + ) + + return [job.result for job in jobs] + + +# --------------------------------------------------------------------------- # +# Input expansion + CLI +# --------------------------------------------------------------------------- # +_IMG_REF = re.compile(r"(!\[[^\]]*\]\()([^)\s]+)(\))") + + +def _load_splitter(): + script_dir = str(Path(__file__).resolve().parent) + if script_dir not in sys.path: + sys.path.insert(0, script_dir) + import splitter + return splitter + + +def _load_local_engine(): + """Import the optional offline-engine module; return it or None.""" + try: + script_dir = str(Path(__file__).resolve().parent) + if script_dir not in sys.path: + sys.path.insert(0, script_dir) + import local_engine + return local_engine + except Exception: # pragma: no cover + return None + + +def split_cap(token, api, override=None) -> int: + """Pages per part: explicit override, else the cap of the backend that will run.""" + if override: + return override + if api == "standard" or (api == "auto" and token): + return 
STANDARD_MAX_PAGES + return AGENT_MAX_PAGES + + +def _merge_parts(part_results, stem: str, final_dir: Path) -> tuple: + """Merge part Markdown + images into ``final_dir``; return (markdown, image_count).""" + images_dir = final_dir / "images" + bodies = [] + image_count = 0 + for n, res in enumerate(part_results, start=1): + if res.state != STATE_DONE or not res.markdown: + continue + part_md_dir = Path(res.markdown_path).parent if res.markdown_path else None + prefix = f"part{n:03d}_" + + def repl(match, _dir=part_md_dir, _pfx=prefix): + nonlocal image_count + ref = match.group(2) + if ref.startswith("http://") or ref.startswith("https://") or _dir is None: + return match.group(0) + src = (_dir / ref) + if not src.is_file(): + return match.group(0) + images_dir.mkdir(parents=True, exist_ok=True) + new_name = _pfx + Path(ref).name + (images_dir / new_name).write_bytes(src.read_bytes()) + image_count += 1 + return f"{match.group(1)}images/{new_name}{match.group(3)}" + + bodies.append(_IMG_REF.sub(repl, res.markdown)) + merged = ("\n\n---\n\n").join(bodies) + final_dir.mkdir(parents=True, exist_ok=True) + (final_dir / f"{stem}.md").write_text(merged, encoding="utf-8") + return merged, image_count + + +def process_split(source, opts, *, token, output_dir, api, resume, timeout, cap, engine="cloud", out_stem=None): + """Split an oversized local PDF, parse each part, and merge. 
Returns a ParseResult, + or None when no split is needed (caller falls back to process_one).""" + if is_url(source) or suffix_of(source) != ".pdf": + return None + stem = safe_stem(source) + out_stem = out_stem or stem + result = ParseResult(name=stem, source=source, modality="pdf") + try: + splitter = _load_splitter() + pages = splitter.pdf_page_count(source) + except Exception as exc: # SplitError (pypdf missing) or unreadable PDF + result.state = STATE_FAILED + result.error = str(exc) + return result + if pages <= cap: + return None # fits — let normal processing handle it + + final_dir = output_dir / out_stem + if resume and (final_dir / f"{out_stem}.md").exists(): + result.state = "skipped" + result.output_dir = str(final_dir) + result.markdown_path = str(final_dir / f"{out_stem}.md") + return result + + started = time.monotonic() + with tempfile.TemporaryDirectory() as tmp: + parts = splitter.split_pdf(source, cap, tmp) + part_out = Path(tmp) / "out" + part_results = [ + process_one(str(p), opts, token=token, output_dir=part_out, + api=api, resume=False, timeout=timeout, engine=engine) + for p in parts + ] + failed = [r for r in part_results if r.state == STATE_FAILED] + if failed: + result.state = STATE_FAILED + result.error = f"part failed: {failed[0].error}" + return result + merged, n_images = _merge_parts(part_results, out_stem, final_dir) + + result.state = STATE_DONE + result.api = part_results[0].api if part_results else api + result.output_dir = str(final_dir) + result.markdown_path = str(final_dir / f"{out_stem}.md") + result.markdown = merged + result.elapsed = round(time.monotonic() - started, 2) + result.task_id = f"split:{len(parts)}parts" + return result + + +def expand_inputs(raw_inputs) -> list: + """Expand directories into supported files; pass through URLs and files. 
+ + De-duplicates while preserving order: identical files (by resolved real path) + and repeated URLs collapse to one, so a file passed twice — or matched by both + an explicit path and a directory scan — is parsed once, not N times. + """ + expanded, seen = [], set() + + def _add(display, key): + if key not in seen: + seen.add(key) + expanded.append(display) + + for item in raw_inputs: + if is_url(item): + _add(item, item) + continue + path = Path(item) + if path.is_dir(): + for child in sorted(path.iterdir()): + if child.is_file() and is_supported(child.name): + _add(str(child), str(child.resolve())) + else: + key = str(path.resolve()) if path.exists() else str(path) + _add(item, key) + return expanded + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="mineru", + description="Parse PDF / Office / image files into Markdown via MinerU.", + ) + parser.add_argument("inputs", nargs="*", help="File(s), a directory, or a URL") + parser.add_argument("--output", "-o", default="./output", help="Output directory (default: ./output)") + parser.add_argument("--token", help="MinerU API token (or set MINERU_TOKEN)") + parser.add_argument("--api", choices=["auto", "agent", "standard"], default="auto", + help="Cloud backend selection (default: auto)") + parser.add_argument("--engine", choices=["cloud", "local", "auto"], default="cloud", + help="cloud (MinerU API) | local (offline pymupdf4llm, born-digital PDFs) | " + "auto (local if the PDF has text, else cloud). 
Default: cloud") + parser.add_argument("--model", choices=["pipeline", "vlm", "MinerU-HTML"], default="vlm", + help="Standard API model (default: vlm)") + parser.add_argument("--format", dest="formats", action="append", default=[], + choices=["docx", "html", "latex"], help="Extra export format (repeatable; forces Standard API)") + parser.add_argument("--lang", default="ch", help="Document language code (default: ch)") + parser.add_argument("--ocr", action="store_true", help="Enable OCR for scanned documents") + parser.add_argument("--no-formula", action="store_true", help="Disable formula recognition") + parser.add_argument("--no-table", action="store_true", help="Disable table recognition") + parser.add_argument("--pages", help="Page range, e.g. '1-10' or '2,4-6' (Standard only)") + parser.add_argument("--workers", "-w", type=int, default=DEFAULT_WORKERS, + help=f"Concurrent submit/upload/download slots (default: {DEFAULT_WORKERS})") + parser.add_argument("--batch-size", type=int, default=BATCH_MAX_FILES, + help=f"Max files per Standard batch submit (default/max: {BATCH_MAX_FILES})") + parser.add_argument("--poll-interval", type=float, default=DEFAULT_POLL_INTERVAL, + help=f"Seconds between status polls (default: {DEFAULT_POLL_INTERVAL}, adaptive backoff)") + parser.add_argument("--resume", action="store_true", help="Skip inputs already parsed") + parser.add_argument("--obsidian", help="Shortcut for --to obsidian with this vault path") + parser.add_argument("--to", dest="to", action="append", default=[], metavar="SINK", + help="Deliver parsed Markdown to a content tool (repeatable): " + "obsidian, logseq, siyuan, notion, linear, yuque, coda, slack, " + "feishu, confluence, onenote, ticktick, dingtalk, airtable, wecom") + parser.add_argument("--list-sinks", action="store_true", help="List available delivery targets and exit") + parser.add_argument("--doctor", action="store_true", help="Run an environment self-check and exit") + parser.add_argument("--chunk", 
action="store_true", help="Also emit heading-aware RAG chunks (JSON sidecar + --json)") + parser.add_argument("--chunk-size", type=int, default=2000, help="Max characters per chunk (default: 2000)") + parser.add_argument("--split", action="store_true", + help="Split oversized PDFs past the page caps, parse parts, merge (needs pypdf)") + parser.add_argument("--split-pages", type=int, help="Pages per split part (default: backend cap, 20 or 200)") + parser.add_argument("--stdout", action="store_true", help="Print Markdown to stdout (single input)") + parser.add_argument("--json", dest="as_json", action="store_true", help="Print machine-readable status to stdout") + parser.add_argument("--timeout", type=int, default=600, help="Per-input timeout in seconds (default: 600)") + parser.add_argument("--quiet", "-q", action="store_true", help="Suppress progress output") + parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}") + return parser + + +def options_from_args(args) -> ParseOptions: + return ParseOptions( + model=args.model, + language=args.lang, + is_ocr=args.ocr, + enable_formula=not args.no_formula, + enable_table=not args.no_table, + page_ranges=args.pages, + extra_formats=tuple(args.formats), + ) + + +def _log(message, *, quiet): + if not quiet: + print(message, file=sys.stderr, flush=True) + + +def _load_sinks(): + """Import the optional ``sinks`` delivery package; return the module or None.""" + try: + script_dir = str(Path(__file__).resolve().parent) + if script_dir not in sys.path: + sys.path.insert(0, script_dir) + import sinks + return sinks + except Exception: # pragma: no cover - sinks are optional + return None + + +def _print_sinks() -> int: + sinks = _load_sinks() + if sinks is None: + print("Delivery sinks unavailable (scripts/sinks not importable).", file=sys.stderr) + return 1 + print("Available delivery targets (use --to NAME, repeatable):\n") + for name in sinks.sink_names(): + sink = sinks.get_sink(name) + req = ", 
".join(sink.requires) if sink.requires else "(no config needed)" + print(f" {name:11} — {sink.label}\n{'':14}env: {req}") + return 0 + + +def _check_network() -> tuple: + try: + status, _ = _http("GET", "https://mineru.net/", timeout=8) + return True, f"reachable (HTTP {status})" + except Exception as exc: # noqa: BLE001 + return False, f"unreachable ({type(exc).__name__})" + + +def _check_token(token: str) -> tuple: + try: + _api_json("POST", f"{STANDARD_API}/extract/task", token=token, payload={}) + return True, "accepted" + except MinerUError as exc: + if exc.code in ("A0202", "A0211"): + return False, f"invalid/expired ({exc.code}) — refresh at https://mineru.net/apiManage/token" + return True, "accepted (token authenticates; a parameter error is expected here)" + except Exception as exc: # noqa: BLE001 + return False, f"check failed ({type(exc).__name__})" + + +def _module_present(module: str) -> bool: + import importlib.util + try: + return importlib.util.find_spec(module) is not None + except (ImportError, ValueError): + return False + + +def _doctor(as_json: bool = False) -> int: + """Environment self-check: Python, API reachability, token, optional extras, sinks.""" + import platform + + py_ok = sys.version_info >= (3, 8) + net_ok, net_detail = _check_network() + token = os.environ.get("MINERU_TOKEN") + if not token: + tok_ok, tok_detail = True, "not set (Agent API works token-free)" + else: + tok_ok, tok_detail = _check_token(token) + + extras = { + "pypdf (--split)": _module_present("pypdf"), + "pymupdf4llm (--engine local)": _module_present("pymupdf4llm"), + "html-for-docx (wps sink)": _module_present("html4docx"), + "roam-client (roam sink)": _module_present("roam_client"), + } + + sinks = _load_sinks() + if sinks is not None: + names = sinks.sink_names() + configured = [n for n in names if sinks.get_sink(n).is_configured()] + sinks_detail = f"{len(names)} registered · {len(configured)} configured" + else: + sinks_detail = "unavailable" + + report = { 
+ "version": __version__, + "python": {"ok": py_ok, "detail": platform.python_version()}, + "network": {"ok": net_ok, "detail": net_detail}, + "token": {"ok": tok_ok, "detail": tok_detail}, + "optional_extras": extras, + "sinks": sinks_detail, + "healthy": py_ok and net_ok and tok_ok, + } + + if as_json: + print(json.dumps(report, ensure_ascii=False, indent=2)) + else: + mark = lambda ok: "✅" if ok else "❌" # noqa: E731 + print(f"MinerU Skill doctor (v{__version__})\n") + print(f" {mark(py_ok)} Python {report['python']['detail']}") + print(f" {mark(net_ok)} MinerU API {net_detail}") + print(f" {mark(tok_ok)} MINERU_TOKEN {tok_detail}") + print(" · Optional extras:") + for label, present in extras.items(): + print(f" {mark(present)} {label}") + print(f" · Sinks: {sinks_detail}") + print(f"\n{'✅ healthy' if report['healthy'] else '❌ issues found'}") + return 0 if report["healthy"] else 1 + + +def _deliver(results, names, sinks, *, quiet): + """Deliver each completed result's Markdown to the requested sinks.""" + for res in results: + if res.state != STATE_DONE or not res.markdown: + continue + doc = sinks.ParsedDoc( + title=res.name, markdown=res.markdown, source=res.source, + modality=res.modality, markdown_path=res.markdown_path, + ) + for outcome in sinks.deliver_all(doc, names): + res.sinks.append(outcome.to_status()) + if outcome.ok: + _log(f" 📤 {res.name} → {outcome.sink}: {outcome.url or outcome.detail or 'ok'}", + quiet=quiet) + else: + _log(f" ⚠️ {res.name} → {outcome.sink}: {outcome.error}", quiet=quiet) + + +def _chunk_results(results, *, max_chars, quiet): + """Attach heading-aware RAG chunks to each result and write a JSON sidecar.""" + try: + script_dir = str(Path(__file__).resolve().parent) + if script_dir not in sys.path: + sys.path.insert(0, script_dir) + import chunking + except Exception: # pragma: no cover - chunking is stdlib, should always import + _log("⚠️ --chunk requested but the chunking module is unavailable.", quiet=quiet) + return + for 
res in results: + if res.state != STATE_DONE or not res.markdown: + continue + res.chunks = chunking.chunk_markdown(res.markdown, max_chars=max_chars, source=res.source) + if res.markdown_path: + sidecar = Path(res.markdown_path).with_suffix(".chunks.json") + try: + sidecar.write_text(json.dumps(res.chunks, ensure_ascii=False, indent=2), encoding="utf-8") + _log(f" 🧩 {res.name}: {len(res.chunks)} chunk(s) → {sidecar}", quiet=quiet) + except OSError: + pass + + +def main(argv=None) -> int: + args = build_parser().parse_args(argv) + + if args.doctor: + return _doctor(as_json=args.as_json) + + if args.list_sinks: + return _print_sinks() + + if args.obsidian: + os.environ["OBSIDIAN_VAULT"] = str(Path(args.obsidian).expanduser()) + if "obsidian" not in args.to: + args.to.append("obsidian") + + token = args.token or os.environ.get("MINERU_TOKEN") + opts = options_from_args(args) + output_dir = Path(args.output) + + sources = expand_inputs(args.inputs) + if not sources: + _log("No supported inputs found.", quiet=args.quiet) + return 1 + + unsupported = [s for s in sources if not is_url(s) and not is_supported(s)] + if unsupported: + _log(f"Unsupported file type(s): {', '.join(unsupported)}", quiet=args.quiet) + return 1 + + if (args.stdout or args.as_json) and len(sources) > 1: + # Keep stdout machine-clean: route progress to stderr only (already does). 
+ pass + + workers = max(1, min(args.workers, len(sources))) + _log( + f"📚 {len(sources)} input(s) · workers={workers} · " + f"{'token set' if token else 'no token (Agent API)'}", + quiet=args.quiet, + ) + + cap = split_cap(token, args.api, args.split_pages) + + def log_result(res): + icon = {"done": "✅", "skipped": "⏭️", "failed": "❌"}.get(res.state, "•") + timing = f" ({res.elapsed}s)" if res.elapsed else "" + _log( + f" {icon} [{res.api}/{res.modality}] {res.name}{timing}" + + (f" — {res.error}" if res.error else ""), + quiet=args.quiet, + ) + + results: list = [] + use_pipeline = args.engine == "cloud" and not args.split and len(sources) > 1 + + if use_pipeline: + results = run_pipeline( + sources, + opts, + token=token, + output_dir=output_dir, + api=args.api, + resume=args.resume, + poll_interval=args.poll_interval, + timeout=args.timeout, + batch_size=args.batch_size, + workers=workers, + want_markdown=bool(args.to) or args.chunk or args.stdout, + on_result=log_result, + ) + else: + out_stem_of = dict(zip(sources, unique_out_stems(sources))) + + def run(source): + res = None + if args.split and args.engine != "local": + res = process_split( + source, opts, token=token, output_dir=output_dir, api=args.api, + resume=args.resume, timeout=args.timeout, cap=cap, engine=args.engine, + out_stem=out_stem_of[source], + ) + if res is None: + res = process_one( + source, opts, token=token, output_dir=output_dir, api=args.api, + obsidian=None, resume=args.resume, timeout=args.timeout, engine=args.engine, + out_stem=out_stem_of[source], + ) + log_result(res) + return res + + if workers == 1: + results = [run(s) for s in sources] + else: + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = {pool.submit(run, s): s for s in sources} + for future in as_completed(futures): + results.append(future.result()) + + done = [r for r in results if r.state == STATE_DONE] + skipped = [r for r in results if r.state == "skipped"] + failed = [r for r in results if r.state 
== STATE_FAILED] + + if args.to: + sinks = _load_sinks() + if sinks is None: + _log("⚠️ --to requested but the sinks package is unavailable.", quiet=args.quiet) + else: + _deliver(results, args.to, sinks, quiet=args.quiet) + + if args.chunk: + _chunk_results(results, max_chars=args.chunk_size, quiet=args.quiet) + + if args.as_json: + print(json.dumps({ + "total": len(results), + "done": len(done), + "skipped": len(skipped), + "failed": len(failed), + "results": [r.to_status() for r in results], + }, ensure_ascii=False, indent=2)) + elif args.stdout: + for r in results: + if r.markdown is not None: + print(r.markdown) + + _log( + f"\n{'='*48}\n✅ {len(done)} · ⏭️ {len(skipped)} · ❌ {len(failed)}" + + (f"\n📁 {output_dir}" if done else ""), + quiet=args.quiet, + ) + if failed: + _log("Failed: " + ", ".join(f"{r.name} ({r.error})" for r in failed), quiet=args.quiet) + return 1 if failed else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/developing/mineru/scripts/mineru_mcp.py b/skills/developing/mineru/scripts/mineru_mcp.py new file mode 100644 index 0000000..502b2e5 --- /dev/null +++ b/skills/developing/mineru/scripts/mineru_mcp.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +"""Zero-dependency MCP server (stdio) for MinerU Skill. + +Speaks newline-delimited JSON-RPC 2.0 over stdin/stdout using only the standard +library, so an MCP host (Claude, Cursor, Windsurf, ...) can call MinerU. Register: + + {"command": "python3", "args": ["scripts/mineru_mcp.py"]} + +Tools: ``mineru_parse``, ``mineru_parse_to``, ``mineru_list_sinks``. 
+""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +import mineru # noqa: E402 + +PROTOCOL_VERSION = "2024-11-05" +SERVER_INFO = {"name": "mineru", "version": mineru.__version__} + +TOOLS = [ + { + "name": "mineru_parse", + "description": "Parse a PDF / Office / image file or URL into clean Markdown via MinerU.", + "inputSchema": { + "type": "object", + "properties": { + "input": {"type": "string", "description": "Local file path or http(s) URL"}, + "output_dir": {"type": "string", "description": "Where to write output (default ./output)"}, + "api": {"type": "string", "enum": ["auto", "agent", "standard"]}, + "engine": {"type": "string", "enum": ["cloud", "local", "auto"]}, + "ocr": {"type": "boolean"}, + "lang": {"type": "string"}, + }, + "required": ["input"], + }, + }, + { + "name": "mineru_parse_to", + "description": "Parse a document and deliver the Markdown into content tools (Obsidian, Notion, Feishu, ...).", + "inputSchema": { + "type": "object", + "properties": { + "input": {"type": "string"}, + "sinks": {"type": "array", "items": {"type": "string"}, "description": "Sink names, e.g. 
['obsidian','notion']"}, + "output_dir": {"type": "string"}, + }, + "required": ["input", "sinks"], + }, + }, + { + "name": "mineru_list_sinks", + "description": "List available delivery targets and their required environment variables.", + "inputSchema": {"type": "object", "properties": {}}, + }, +] + + +class MethodNotFound(Exception): + pass + + +def _text_result(text: str, is_error: bool = False) -> dict: + return {"content": [{"type": "text", "text": text}], "isError": is_error} + + +def _tool_parse(args: dict) -> dict: + opts = mineru.ParseOptions(is_ocr=bool(args.get("ocr")), language=args.get("lang", "ch")) + token = os.environ.get("MINERU_TOKEN") + output_dir = Path(args.get("output_dir") or "./output") + res = mineru.process_one( + args["input"], opts, token=token, output_dir=output_dir, + api=args.get("api", "auto"), engine=args.get("engine", "cloud"), + ) + if res.state == "done": + return _text_result(res.markdown or "") + return _text_result(f"Parse failed: {res.error}", is_error=True) + + +def _tool_parse_to(args: dict) -> dict: + opts = mineru.ParseOptions() + token = os.environ.get("MINERU_TOKEN") + output_dir = Path(args.get("output_dir") or "./output") + res = mineru.process_one(args["input"], opts, token=token, output_dir=output_dir) + if res.state != "done": + return _text_result(f"Parse failed: {res.error}", is_error=True) + sinks = mineru._load_sinks() + if sinks is None: + return _text_result("Sinks package unavailable.", is_error=True) + doc = sinks.ParsedDoc(title=res.name, markdown=res.markdown, source=res.source, + modality=res.modality, markdown_path=res.markdown_path) + outcomes = [o.to_status() for o in sinks.deliver_all(doc, args["sinks"])] + any_fail = any(not o["ok"] for o in outcomes) + return _text_result(json.dumps({"name": res.name, "deliveries": outcomes}, ensure_ascii=False, indent=2), + is_error=any_fail) + + +def _tool_list_sinks(_args: dict) -> dict: + sinks = mineru._load_sinks() + if sinks is None: + return 
_text_result("Sinks package unavailable.", is_error=True) + listing = [{"name": n, "label": sinks.get_sink(n).label, "requires": list(sinks.get_sink(n).requires)} + for n in sinks.sink_names()] + return _text_result(json.dumps(listing, ensure_ascii=False, indent=2)) + + +_TOOL_HANDLERS = { + "mineru_parse": _tool_parse, + "mineru_parse_to": _tool_parse_to, + "mineru_list_sinks": _tool_list_sinks, +} + + +def _route(method: str, params: dict): + if method == "initialize": + return {"protocolVersion": PROTOCOL_VERSION, "capabilities": {"tools": {}}, "serverInfo": SERVER_INFO} + if method == "tools/list": + return {"tools": TOOLS} + if method == "tools/call": + name = params.get("name") + handler = _TOOL_HANDLERS.get(name) + if handler is None: + return _text_result(f"Unknown tool: {name}", is_error=True) + try: + return handler(params.get("arguments") or {}) + except Exception as exc: # noqa: BLE001 - report as a tool error, never crash the server + return _text_result(f"{type(exc).__name__}: {exc}", is_error=True) + raise MethodNotFound(method) + + +def dispatch(request: dict): + """Handle one JSON-RPC request dict; return a response dict, or None for notifications.""" + is_notification = "id" not in request + req_id = request.get("id") + try: + result = _route(request.get("method"), request.get("params") or {}) + except MethodNotFound as exc: + if is_notification: + return None + return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32601, "message": f"Method not found: {exc}"}} + except Exception as exc: # noqa: BLE001 + if is_notification: + return None + return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32603, "message": str(exc)}} + if is_notification: + return None + return {"jsonrpc": "2.0", "id": req_id, "result": result} + + +def serve(stdin=None, stdout=None) -> None: + """Read newline-delimited JSON-RPC from stdin, write responses to stdout.""" + stdin = stdin or sys.stdin + stdout = stdout or sys.stdout + for line in stdin: + line = 
line.strip() + if not line: + continue + try: + request = json.loads(line) + except ValueError: + continue + response = dispatch(request) + if response is not None: + stdout.write(json.dumps(response, ensure_ascii=False) + "\n") + stdout.flush() + + +def main() -> int: + serve() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/developing/mineru/scripts/sinks/__init__.py b/skills/developing/mineru/scripts/sinks/__init__.py new file mode 100644 index 0000000..28802d1 --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/__init__.py @@ -0,0 +1,75 @@ +"""Pluggable delivery sinks for parsed Markdown. + +Each submodule registers one or more :class:`Sink` implementations that deliver a +:class:`ParsedDoc` into a content tool using that tool's official ingestion path. +Importing this package populates the registry; a sink module that fails to import +is recorded in :data:`IMPORT_ERRORS` rather than breaking the others. +""" + +from __future__ import annotations + +import importlib +import sys + +from .base import ( # noqa: F401 + ParsedDoc, + Sink, + SinkError, + SinkResult, + get_sink, + sink_names, + REGISTRY, +) + +# Sink modules to load. Order is cosmetic. 
+_MODULES = [ + "local", # obsidian, logseq (filesystem) + "siyuan", + "notion", + "linear", + "yuque", + "coda", + "ticktick", + "dingtalk", + "airtable", + "wecom", + "slack", + "feishu", + "confluence", + "onenote", + "roam", # optional dependency (roam-client) + "wps", # optional dependency (html-for-docx) +] + +IMPORT_ERRORS: dict = {} + +for _name in _MODULES: + try: + importlib.import_module(f"{__name__}.{_name}") + except Exception as exc: # noqa: BLE001 - a bad sink shouldn't break the rest + IMPORT_ERRORS[_name] = f"{type(exc).__name__}: {exc}" + print(f"[sinks] failed to load {_name}: {exc}", file=sys.stderr) + + +def deliver_all(doc: ParsedDoc, names) -> list: + """Deliver ``doc`` to each named sink, returning a list of :class:`SinkResult`.""" + results = [] + for name in names: + sink = get_sink(name) + if sink is None: + results.append(SinkResult(sink=name, ok=False, error=f"unknown sink '{name}'")) + continue + missing = sink.missing_config() + if missing: + results.append(SinkResult( + sink=sink.name, ok=False, + error=f"missing config: {', '.join(missing)}", + )) + continue + try: + results.append(sink.deliver(doc)) + except SinkError as exc: + results.append(SinkResult(sink=sink.name, ok=False, error=str(exc))) + except Exception as exc: # noqa: BLE001 - surface but never crash the run + results.append(SinkResult(sink=sink.name, ok=False, error=f"{type(exc).__name__}: {exc}")) + return results diff --git a/skills/developing/mineru/scripts/sinks/_http.py b/skills/developing/mineru/scripts/sinks/_http.py new file mode 100644 index 0000000..4f14949 --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/_http.py @@ -0,0 +1,72 @@ +"""Zero-dependency HTTP helpers shared by all sinks (stdlib urllib only). + +``http_request`` is the single seam tests monkeypatch. 
+""" + +from __future__ import annotations + +import json +import mimetypes +import urllib.error +import urllib.request +from typing import Optional + +USER_AGENT = "MinerU-Skill-sink/1.0" + + +def http_request(method, url, *, headers=None, data=None, timeout=60): + """Perform one HTTP request. Returns ``(status_code, body_bytes)``.""" + req = urllib.request.Request(url, data=data, method=method, headers=headers or {}) + req.add_header("User-Agent", USER_AGENT) + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + return resp.getcode(), resp.read() + except urllib.error.HTTPError as exc: + body = exc.read() if hasattr(exc, "read") else b"" + return exc.code, body + + +def request_json(method, url, *, headers=None, payload=None, timeout=60): + """JSON request helper. Returns ``(status_code, parsed_json_or_empty_dict)``.""" + hdrs = dict(headers or {}) + body = None + if payload is not None: + hdrs.setdefault("Content-Type", "application/json") + body = json.dumps(payload, ensure_ascii=False).encode("utf-8") + status, raw = http_request(method, url, headers=hdrs, data=body, timeout=timeout) + parsed: dict = {} + if raw: + try: + parsed = json.loads(raw.decode("utf-8")) + except (ValueError, UnicodeDecodeError): + parsed = {} + return status, parsed + + +def encode_multipart(fields=None, files=None): + """Build a ``multipart/form-data`` body with stdlib only. + + ``fields``: dict of str -> str. ``files``: list of (field_name, filename, bytes). + Returns ``(content_type, body_bytes)``. 
+ """ + boundary = "----MinerUSinkBoundary7MA4YWxkTrZu0gW" + crlf = b"\r\n" + parts = [] + for name, value in (fields or {}).items(): + parts.append(b"--" + boundary.encode()) + parts.append(f'Content-Disposition: form-data; name="{name}"'.encode()) + parts.append(b"") + parts.append(str(value).encode("utf-8")) + for field_name, filename, content in files or []: + ctype = mimetypes.guess_type(filename)[0] or "application/octet-stream" + parts.append(b"--" + boundary.encode()) + parts.append( + f'Content-Disposition: form-data; name="{field_name}"; filename="{filename}"'.encode() + ) + parts.append(f"Content-Type: {ctype}".encode()) + parts.append(b"") + parts.append(content) + parts.append(b"--" + boundary.encode() + b"--") + parts.append(b"") + body = crlf.join(parts) + return f"multipart/form-data; boundary={boundary}", body diff --git a/skills/developing/mineru/scripts/sinks/_md.py b/skills/developing/mineru/scripts/sinks/_md.py new file mode 100644 index 0000000..7dab4c4 --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/_md.py @@ -0,0 +1,244 @@ +"""Small, dependency-free Markdown utilities used by sinks. + +These are intentionally pragmatic, not a full CommonMark implementation: they +cover the constructs MinerU emits (headings, emphasis, code, lists, tables, +blockquotes, links, images) well enough to deliver faithful content to tools +that require HTML (Confluence, OneNote) or an outline (Logseq). 
+""" + +from __future__ import annotations + +import html +import re +from pathlib import Path +from typing import Optional + +_IMAGE_RE = re.compile(r"!\[(?P<alt>[^\]]*)\]\((?P<ref>[^)\s]+)(?:\s+\"[^\"]*\")?\)") +_ILLEGAL_FS = re.compile(r'[\\/:*?"<>|#^\[\]]+') + + +def slugify(text: str, default: str = "document") -> str: + """Filesystem/URL-safe slug.""" + text = text.strip().lower() + text = re.sub(r"[\s_]+", "-", text) + text = re.sub(r"[^a-z0-9\-]+", "", text) + text = re.sub(r"-{2,}", "-", text).strip("-") + return text or default + + +def safe_filename(title: str, default: str = "document") -> str: + """Clean a title into a safe note filename (keeps unicode, drops illegal chars).""" + name = _ILLEGAL_FS.sub(" ", title).strip() + name = re.sub(r"\s{2,}", " ", name) + return name[:120] or default + + +def is_remote(ref: str) -> bool: + return ref.startswith("http://") or ref.startswith("https://") or ref.startswith("data:") + + +def find_local_images(markdown: str, base_dir) -> list: + """Return ``[(alt, ref, Path)]`` for image refs that point at existing local files.""" + base = Path(base_dir) if base_dir else None + found = [] + seen = set() + for match in _IMAGE_RE.finditer(markdown): + ref = match.group("ref") + if is_remote(ref) or ref in seen: + continue + path = Path(ref) + if not path.is_absolute() and base is not None: + path = base / ref + if path.is_file(): + found.append((match.group("alt"), ref, path)) + seen.add(ref) + return found + + +def rewrite_images(markdown: str, mapping: dict) -> str: + """Rewrite local image refs using ``{old_ref: new_ref}``.""" + def repl(match): + ref = match.group("ref") + if ref in mapping: + return f"![{match.group('alt')}]({mapping[ref]})" + return match.group(0) + + return _IMAGE_RE.sub(repl, markdown) + + +def yaml_frontmatter(props: dict) -> str: + """Render a YAML frontmatter block.
List values become ``- item`` lines.""" + lines = ["---"] + for key, value in props.items(): + if value is None or value == "" or value == []: + continue + if isinstance(value, (list, tuple)): + lines.append(f"{key}:") + for item in value: + lines.append(f" - {item}") + else: + lines.append(f"{key}: {value}") + lines.append("---") + return "\n".join(lines) + + +# --------------------------------------------------------------------------- # +# Inline + block Markdown -> HTML (pragmatic, XHTML-safe) +# --------------------------------------------------------------------------- # +def _inline(text: str) -> str: + """Convert inline Markdown to HTML on already-escaped text.""" + # images first, then links + text = _IMAGE_RE.sub( + lambda m: f'<img src="{m.group("ref")}" alt="{m.group("alt")}"/>', + text, + ) + text = re.sub(r"\[([^\]]+)\]\(([^)\s]+)\)", + lambda m: f'<a href="{m.group(2)}">{m.group(1)}</a>', text) + text = re.sub(r"`([^`]+)`", r"<code>\1</code>", text) + text = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", text) + text = re.sub(r"(?<!\*)\*([^*]+)\*(?!\*)", r"<em>\1</em>", text) + return text + + +def md_to_html(markdown: str) -> str: + """Convert a Markdown document to a pragmatic, XHTML-safe HTML fragment.""" + out = [] + lines = markdown.replace("\r\n", "\n").split("\n") + i = 0 + n = len(lines) + in_code = False + code_buf: list = [] + list_stack: list = [] # 'ul' / 'ol' + + def close_lists(): + while list_stack: + out.append(f"</{list_stack.pop()}>") + + while i < n: + line = lines[i] + fence = line.strip().startswith("```") + if fence and not in_code: + close_lists() + in_code = True + code_buf = [] + i += 1 + continue + if fence and in_code: + out.append("<pre><code>" + html.escape("\n".join(code_buf)) + "</code></pre>") + in_code = False + i += 1 + continue + if in_code: + code_buf.append(line) + i += 1 + continue + + stripped = line.strip() + if not stripped: + close_lists() + i += 1 + continue + + # table block + if "|" in stripped and i + 1 < n and re.match(r"^\s*\|?[\s:|-]+\|?\s*$", lines[i + 1]): + close_lists() + header = [c.strip() for c in stripped.strip("|").split("|")] + rows = [] + i += 2 + while i < n and "|" in lines[i] and lines[i].strip(): + rows.append([c.strip() for c in lines[i].strip().strip("|").split("|")]) + i += 1 + out.append("<table><tr>" + + "".join(f"<th>{_inline(html.escape(c))}</th>" for c in header) + + "</tr>") + for row in rows: + out.append("<tr>" + "".join(f"<td>{_inline(html.escape(c))}</td>" for c in row) + "</tr>") + out.append("</table>") + continue + + heading = re.match(r"^(#{1,6})\s+(.*)$", stripped) + if heading: + close_lists() + level = len(heading.group(1)) + out.append(f"<h{level}>{_inline(html.escape(heading.group(2)))}</h{level}>") + i += 1 + continue + + if stripped.startswith(">"): + close_lists() + out.append(f"<blockquote>{_inline(html.escape(stripped[1:].strip()))}</blockquote>") + i += 1 + continue + + if re.match(r"^([-*+])\s+", stripped): + if not list_stack or list_stack[-1] != "ul": + close_lists() + list_stack.append("ul") + out.append("<ul>") + item = re.sub(r"^([-*+])\s+", "", stripped) + out.append(f"<li>{_inline(html.escape(item))}</li>") + i += 1 + continue + + if re.match(r"^\d+\.\s+", stripped): + if not list_stack or list_stack[-1] != "ol": + close_lists() + list_stack.append("ol") + out.append("<ol>") + item = re.sub(r"^\d+\.\s+", "", stripped) + out.append(f"<li>{_inline(html.escape(item))}</li>") + i += 1 + continue + + if re.match(r"^([-*_])\1{2,}$", stripped): + close_lists() + out.append("<hr/>") + i += 1 + continue + + close_lists() + out.append(f"<p>{_inline(html.escape(stripped))}</p>") + i += 1 + + if in_code: + out.append("<pre><code>" + html.escape("\n".join(code_buf)) + "</code></pre>") + close_lists() + return "\n".join(out) + + +# --------------------------------------------------------------------------- # +# Markdown -> Logseq outline +# --------------------------------------------------------------------------- # +def md_to_logseq(markdown: str, properties: Optional[dict] = None) -> str: + """Convert flat Markdown into a Logseq outline. + + Every line becomes a ``- `` block. Headings are top-level blocks; the content + that follows a heading nests one level beneath it. Page properties + (``key:: value``) go on the first block, as Logseq requires. + """ + out = [] + if properties: + prop_lines = [] + for key, value in properties.items(): + if not value: + continue + if isinstance(value, (list, tuple)): + value = ", ".join(str(v) for v in value) + prop_lines.append(f"{key}:: {value}") + if prop_lines: + out.append("- " + prop_lines[0]) + out.extend(f" {p}" for p in prop_lines[1:]) + + have_heading = False + for raw in markdown.replace("\r\n", "\n").split("\n"): + line = raw.strip() + if not line: + continue + if re.match(r"^#{1,6}\s+", line): + out.append(f"- {line}") + have_heading = True + elif have_heading: + out.append(f"\t- {line}") + else: + out.append(f"- {line}") + return "\n".join(out) diff --git a/skills/developing/mineru/scripts/sinks/airtable.py b/skills/developing/mineru/scripts/sinks/airtable.py new file mode 100644 index 0000000..638e8b1 --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/airtable.py @@ -0,0 +1,50 @@ +"""Airtable sink — store parsed Markdown as a record in a base/table. + +Airtable is a database, not a document tool: the native ingestion path is a +record whose fields hold the title and the Markdown body. Field names are +configurable to match an existing table schema. + +Docs: https://airtable.com/developers/web/api/create-records +(POST /v0/{baseId}/{tableIdOrName}). +""" + +from __future__ import annotations + +import urllib.parse + +from .
import _http +from .base import ParsedDoc, Sink, SinkError, SinkResult, register + +API_BASE = "https://api.airtable.com/v0" + + +@register +class AirtableSink(Sink): + name = "airtable" + requires = ("AIRTABLE_API_KEY", "AIRTABLE_BASE_ID", "AIRTABLE_TABLE") + label = "Airtable record (database)" + + def deliver(self, doc: ParsedDoc) -> SinkResult: + api_key = self.env("AIRTABLE_API_KEY") + base = self.env("AIRTABLE_BASE_ID") + table = self.env("AIRTABLE_TABLE") + title_field = self.env("AIRTABLE_TITLE_FIELD", "Title") + body_field = self.env("AIRTABLE_BODY_FIELD", "Notes") + + url = f"{API_BASE}/{base}/{urllib.parse.quote(table)}" + headers = {"Authorization": f"Bearer {api_key}"} + payload = {"fields": {title_field: doc.title, body_field: doc.markdown}} + + status, parsed = _http.request_json("POST", url, headers=headers, payload=payload) + + if parsed.get("error") or status >= 400: + raise SinkError(str(parsed.get("error") or f"HTTP {status}")) + if not parsed.get("id"): + raise SinkError(f"Airtable returned no record id: {parsed}") + + return SinkResult( + sink=self.name, + ok=True, + url=None, + detail="stored as a database record (Airtable is a DB, not a doc)", + ) diff --git a/skills/developing/mineru/scripts/sinks/base.py b/skills/developing/mineru/scripts/sinks/base.py new file mode 100644 index 0000000..4530033 --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/base.py @@ -0,0 +1,101 @@ +"""Core types and the sink registry for delivering parsed Markdown to content tools. + +A *sink* takes a :class:`ParsedDoc` (Markdown + local images + metadata) and +delivers it into one destination (Obsidian, Notion, Slack, Feishu, ...) using +that tool's OFFICIAL native ingestion path. Sinks read their configuration from +environment variables so an AI agent can run them without interactive prompts. 
+""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class ParsedDoc: + """A parsed document ready for delivery.""" + + title: str + markdown: str + images: tuple = () # absolute paths to local image files + source: str = "" + modality: str = "unknown" + markdown_path: Optional[str] = None + + +@dataclass +class SinkResult: + """Outcome of delivering a :class:`ParsedDoc` to one sink.""" + + sink: str + ok: bool + url: Optional[str] = None + detail: Optional[str] = None + error: Optional[str] = None + + def to_status(self) -> dict: + return { + "sink": self.sink, + "ok": self.ok, + "url": self.url, + "detail": self.detail, + "error": self.error, + } + + +class SinkError(Exception): + """Raised by a sink when delivery fails for a known reason.""" + + +class Sink: + """Base class for a delivery target. + + Subclasses set ``name``/``aliases``/``requires`` and implement + :meth:`deliver`. ``requires`` lists the environment variables that must be + present for the sink to be usable. 
+ """ + + name: str = "base" + aliases: tuple = () + requires: tuple = () # required env vars + label: str = "" # human description + local: bool = False # filesystem-only, no network/auth + + def env(self, key: str, default: Optional[str] = None) -> Optional[str]: + value = os.environ.get(key, default) + return value.strip() if isinstance(value, str) else value + + def missing_config(self) -> list: + return [k for k in self.requires if not self.env(k)] + + def is_configured(self) -> bool: + return not self.missing_config() + + def deliver(self, doc: ParsedDoc) -> SinkResult: # pragma: no cover - abstract + raise NotImplementedError + + +# --------------------------------------------------------------------------- # +# Registry +# --------------------------------------------------------------------------- # +REGISTRY: dict = {} + + +def register(cls): + """Class decorator that instantiates a sink and registers it by name+aliases.""" + inst = cls() + REGISTRY[inst.name] = inst + for alias in inst.aliases: + REGISTRY[alias] = inst + return cls + + +def get_sink(name: str) -> Optional[Sink]: + return REGISTRY.get(name.lower()) + + +def sink_names() -> list: + """Canonical sink names (no aliases), sorted.""" + return sorted({s.name for s in REGISTRY.values()}) diff --git a/skills/developing/mineru/scripts/sinks/coda.py b/skills/developing/mineru/scripts/sinks/coda.py new file mode 100644 index 0000000..c2dee5b --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/coda.py @@ -0,0 +1,72 @@ +"""Coda sink: deliver Markdown as a page, into an existing doc or a new one. + +Coda's API (``https://coda.io/apis/v1``) authenticates with a Bearer token. +Markdown is delivered as canvas page content. If ``CODA_DOC_ID`` is set, a new +page is added to that doc; otherwise a new doc is created with the content as its +initial page. + +Coda canvas content embeds images by URL only, so local image refs are left +untouched — host images at a public URL for them to render. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from . import _http, _md +from .base import ParsedDoc, Sink, SinkError, SinkResult, register + +API = "https://coda.io/apis/v1" + + +def _canvas(markdown: str) -> dict: + return {"type": "canvas", "canvasContent": {"format": "markdown", "content": markdown}} + + +@register +class CodaSink(Sink): + name = "coda" + requires = ("CODA_API_TOKEN",) + label = "Coda page (REST API)" + + def deliver(self, doc: ParsedDoc) -> SinkResult: + token = self.env("CODA_API_TOKEN") + doc_id = self.env("CODA_DOC_ID") + headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + + base_dir = Path(doc.markdown_path).parent if doc.markdown_path else None + n_images = len(_md.find_local_images(doc.markdown, base_dir)) + + if doc_id: + status, parsed = _http.request_json( + "POST", f"{API}/docs/{doc_id}/pages", headers=headers, payload={ + "name": doc.title, + "pageContent": _canvas(doc.markdown), + }, + ) + else: + status, parsed = _http.request_json( + "POST", f"{API}/docs", headers=headers, payload={ + "title": doc.title, + "initialPage": { + "name": doc.title, + "pageContent": _canvas(doc.markdown), + }, + }, + ) + + if status >= 400: + raise SinkError(parsed.get("message") or f"HTTP {status}") + + if n_images: + detail = f"text only ({n_images} local image(s); Coda embeds images by URL)" + else: + detail = "text only" + return SinkResult( + sink=self.name, ok=True, + url=parsed.get("browserLink"), + detail=detail, + ) diff --git a/skills/developing/mineru/scripts/sinks/confluence.py b/skills/developing/mineru/scripts/sinks/confluence.py new file mode 100644 index 0000000..5565478 --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/confluence.py @@ -0,0 +1,66 @@ +"""Confluence sink: create a page from the parsed Markdown via the Cloud REST API. + +Confluence Cloud ingests content as *storage-format* HTML. 
Delivery converts the +Markdown to HTML and creates a page with the v2 REST API +(``POST /wiki/api/v2/pages``) using Basic auth (email + API token). + +Local images are not attached — Confluence storage HTML references attachments by +filename, which would require a separate upload step. +""" + +from __future__ import annotations + +import base64 + +from . import _http, _md +from .base import ParsedDoc, Sink, SinkError, SinkResult, register + + +@register +class ConfluenceSink(Sink): + name = "confluence" + requires = ( + "CONFLUENCE_BASE_URL", + "CONFLUENCE_EMAIL", + "CONFLUENCE_API_TOKEN", + "CONFLUENCE_SPACE_ID", + ) + label = "Confluence Cloud page (storage HTML)" + + def deliver(self, doc: ParsedDoc) -> SinkResult: + base = self.env("CONFLUENCE_BASE_URL").rstrip("/") + email = self.env("CONFLUENCE_EMAIL") + token = self.env("CONFLUENCE_API_TOKEN") + space = self.env("CONFLUENCE_SPACE_ID") + + auth = base64.b64encode(f"{email}:{token}".encode("utf-8")).decode("ascii") + headers = { + "Authorization": f"Basic {auth}", + "Content-Type": "application/json", + } + + html = _md.md_to_html(doc.markdown) + status, parsed = _http.request_json( + "POST", + f"{base}/wiki/api/v2/pages", + headers=headers, + payload={ + "spaceId": space, + "status": "current", + "title": doc.title, + "body": {"representation": "storage", "value": html}, + }, + ) + if status >= 400: + raise SinkError( + parsed.get("title") + or parsed.get("message") + or f"Confluence HTTP {status}" + ) + + webui = (parsed.get("_links") or {}).get("webui") + url = base + webui if webui else None + return SinkResult( + sink=self.name, ok=True, url=url, + detail="converted Markdown->storage HTML (local images not attached)", + ) diff --git a/skills/developing/mineru/scripts/sinks/dingtalk.py b/skills/developing/mineru/scripts/sinks/dingtalk.py new file mode 100644 index 0000000..f669456 --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/dingtalk.py @@ -0,0 +1,65 @@ +"""DingTalk (钉钉) sink — push parsed 
Markdown as a robot markdown message. + +A DingTalk custom robot accepts a ``markdown`` message type. The official native +ingestion path is therefore a webhook POST carrying the document title and body. +When a signing secret is configured the request is HMAC-SHA256 signed per +DingTalk's spec. DingTalk's markdown renderer only fetches images over public +URLs, so local images won't render. + +Docs: https://open.dingtalk.com/document/robots/custom-robot-access. +""" + +from __future__ import annotations + +import base64 +import hashlib +import hmac +import time +import urllib.parse + +from . import _http +from .base import ParsedDoc, Sink, SinkError, SinkResult, register + + +@register +class DingTalkSink(Sink): + name = "dingtalk" + aliases = ("钉钉",) + requires = ("DINGTALK_WEBHOOK",) + label = "DingTalk robot markdown (钉钉)" + + def _build_url(self) -> str: + webhook = self.env("DINGTALK_WEBHOOK") + if webhook.startswith("http"): + url = webhook + else: + url = f"https://oapi.dingtalk.com/robot/send?access_token={webhook}" + + secret = self.env("DINGTALK_SECRET") + if secret: + timestamp = str(round(time.time() * 1000)) + string_to_sign = f"{timestamp}\n{secret}" + hmac_code = hmac.new( + secret.encode(), string_to_sign.encode(), hashlib.sha256 + ).digest() + sign = urllib.parse.quote_plus(base64.b64encode(hmac_code)) + url += f"&timestamp={timestamp}&sign={sign}" + return url + + def deliver(self, doc: ParsedDoc) -> SinkResult: + url = self._build_url() + payload = { + "msgtype": "markdown", + "markdown": {"title": doc.title, "text": doc.markdown}, + } + status, parsed = _http.request_json("POST", url, payload=payload) + + if parsed.get("errcode") not in (0, None): + raise SinkError(parsed.get("errmsg") or f"DingTalk HTTP {status}: {parsed}") + + return SinkResult( + sink=self.name, + ok=True, + url=None, + detail="robot markdown message (local images won't render; host publicly)", + ) diff --git a/skills/developing/mineru/scripts/sinks/feishu.py
b/skills/developing/mineru/scripts/sinks/feishu.py new file mode 100644 index 0000000..5c662fd --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/feishu.py @@ -0,0 +1,124 @@ +"""Feishu / Lark sink: import the parsed Markdown as a Docx document. + +Feishu (飞书) / Lark ingests Markdown through its Drive import pipeline. Delivery +follows that official path: + +1. ``tenant_access_token/internal`` — exchange the app id/secret for a tenant + access token. +2. ``drive/v1/medias/upload_all`` — upload the ``.md`` bytes as an import medium + and obtain a ``file_token``. +3. ``drive/v1/import_tasks`` — kick off an import task converting the medium to a + Docx, returning a ``ticket``. +4. Poll ``drive/v1/import_tasks/{ticket}`` until the job finishes, surfacing the + resulting document URL. + +Local images are not uploaded — they would need public URLs to render in Docx. +""" + +from __future__ import annotations + +import json +import time + +from . import _http, _md +from .base import ParsedDoc, Sink, SinkError, SinkResult, register + + +@register +class FeishuSink(Sink): + name = "feishu" + aliases = ("lark", "飞书") + requires = ("FEISHU_APP_ID", "FEISHU_APP_SECRET") + label = "Feishu / Lark Docx (Drive import)" + + def deliver(self, doc: ParsedDoc) -> SinkResult: + app_id = self.env("FEISHU_APP_ID") + app_secret = self.env("FEISHU_APP_SECRET") + folder_token = self.env("FEISHU_FOLDER_TOKEN") + + # Step 1: tenant access token. + status, parsed = _http.request_json( + "POST", + "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal", + payload={"app_id": app_id, "app_secret": app_secret}, + ) + token = parsed.get("tenant_access_token") + if parsed.get("code") not in (0, None) or not token: + raise SinkError(parsed.get("msg") or f"Feishu auth failed (HTTP {status})") + headers = {"Authorization": f"Bearer {token}"} + + # Step 2: upload the Markdown bytes as an import medium. 
+ content = doc.markdown.encode("utf-8") + fname = _md.safe_filename(doc.title) + ".md" + ctype, body = _http.encode_multipart( + fields={ + "file_name": fname, + "parent_type": "ccm_import_open", + "size": str(len(content)), + "extra": json.dumps({"obj_type": "docx", "file_extension": "md"}), + }, + files=[("file", fname, content)], + ) + up_status, raw = _http.http_request( + "POST", + "https://open.feishu.cn/open-apis/drive/v1/medias/upload_all", + headers={**headers, "Content-Type": ctype}, + data=body, + ) + parsed = _parse_json(raw) + if parsed.get("code") not in (0, None): + raise SinkError(parsed.get("msg") or f"Feishu media upload failed (HTTP {up_status})") + file_token = (parsed.get("data") or {}).get("file_token") + if not file_token: + raise SinkError("Feishu did not return a file_token") + + # Step 3: create the import task. + status, parsed = _http.request_json( + "POST", + "https://open.feishu.cn/open-apis/drive/v1/import_tasks", + headers=headers, + payload={ + "file_extension": "md", + "file_token": file_token, + "type": "docx", + "file_name": doc.title, + "point": {"mount_type": 1, "mount_key": folder_token or ""}, + }, + ) + if parsed.get("code") not in (0, None): + raise SinkError(parsed.get("msg") or f"Feishu import task failed (HTTP {status})") + ticket = (parsed.get("data") or {}).get("ticket") + if not ticket: + raise SinkError("Feishu did not return an import ticket") + + # Step 4: poll until the import job completes. 
+ url = None + for _attempt in range(20): + status, parsed = _http.request_json( + "GET", + f"https://open.feishu.cn/open-apis/drive/v1/import_tasks/{ticket}", + headers=headers, + ) + res = (parsed.get("data") or {}).get("result") or {} + job_status = res.get("job_status") + if job_status == 0: + url = res.get("url") + break + if job_status in (1, 2): + time.sleep(1) + continue + raise SinkError(res.get("job_error_msg") or "Feishu import failed") + + return SinkResult( + sink=self.name, ok=True, url=url, + detail="imported to Feishu Docx (local images need public URLs)", + ) + + +def _parse_json(raw): + if not raw: + return {} + try: + return json.loads(raw.decode("utf-8")) + except (ValueError, UnicodeDecodeError): + return {} diff --git a/skills/developing/mineru/scripts/sinks/linear.py b/skills/developing/mineru/scripts/sinks/linear.py new file mode 100644 index 0000000..944849d --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/linear.py @@ -0,0 +1,75 @@ +"""Linear sink: create an issue from Markdown via the GraphQL API. + +Linear's API is GraphQL at ``https://api.linear.app/graphql`` and authenticates +with a raw API key in the ``Authorization`` header (no ``Bearer`` prefix). The +issue description is Markdown; Linear renders inline ``data:`` image URIs, so +local images are read and embedded as base64 data URIs before delivery. +""" + +from __future__ import annotations + +import base64 +from pathlib import Path + +from . 
from . import _http, _md
from .base import ParsedDoc, Sink, SinkError, SinkResult, register

# Linear serves every operation from this single GraphQL endpoint.
API = "https://api.linear.app/graphql"

# The one mutation this sink issues; the selection set asks for the new
# issue's id, url and human-readable identifier.
_MUTATION = (
    "mutation IssueCreate($input: IssueCreateInput!)"
    "{issueCreate(input:$input){success issue{id url identifier}}}"
)

# File suffix -> media type used in the generated ``data:`` URI.
_MIME = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


def _data_uri(path: Path) -> str:
    """Return the file at ``path`` encoded as a base64 ``data:`` URI.

    The media type is looked up from the (case-insensitive) file suffix and
    falls back to ``image/png`` for suffixes missing from ``_MIME``.
    """
    media_type = _MIME.get(path.suffix.lower(), "image/png")
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    return "data:" + media_type + ";base64," + encoded


@register
class LinearSink(Sink):
    """Create one Linear issue whose description is the parsed Markdown."""

    name = "linear"
    requires = ("LINEAR_API_KEY", "LINEAR_TEAM_ID")
    label = "Linear issue (GraphQL API)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Inline local images as data URIs and create the issue.

        Raises:
            SinkError: when the GraphQL response carries ``errors`` or does not
                report ``issueCreate.success``.
        """
        api_key = self.env("LINEAR_API_KEY")
        team_id = self.env("LINEAR_TEAM_ID")

        # Image references are resolved relative to the Markdown file, when known.
        md_path = doc.markdown_path
        image_root = Path(md_path).parent if md_path else None
        inlined = {}
        for _alt, ref, path in _md.find_local_images(doc.markdown, image_root):
            inlined[ref] = _data_uri(path)
        description = _md.rewrite_images(doc.markdown, inlined)

        issue_input = {
            "teamId": team_id,
            "title": doc.title,
            "description": description,
        }
        status, parsed = _http.request_json(
            "POST",
            API,
            # Linear expects the raw API key here (no "Bearer" prefix).
            headers={"Authorization": api_key, "Content-Type": "application/json"},
            payload={"query": _MUTATION, "variables": {"input": issue_input}},
        )

        errors = parsed.get("errors")
        if errors:
            raise SinkError(str(errors))

        created = (parsed.get("data") or {}).get("issueCreate") or {}
        if not created.get("success"):
            raise SinkError(f"Linear did not create the issue (HTTP {status})")

        issue = created.get("issue") or {}
        return SinkResult(
            sink=self.name,
            ok=True,
            url=issue.get("url"),
            detail=f"{len(inlined)} image(s) inlined",
        )
"""Local-first sinks: Obsidian and Logseq (filesystem writes, no auth).

Both tools are folders of Markdown files. The native ingestion is a filesystem
write following each tool's conventions:

* Obsidian — a flat note with YAML frontmatter; images in a per-note assets
  folder, referenced with relative Markdown embeds.
* Logseq — an outline (every line a ``- `` block) with ``key:: value`` page
  properties on the first block; images in ``assets/`` referenced as
  ``![](../assets/x.png)``.
"""

from __future__ import annotations

from pathlib import Path

from . import _md
from .base import ParsedDoc, Sink, SinkError, SinkResult, register


def _dest_names(images, name_prefix: str = "") -> dict:
    """Choose a destination filename for every distinct source image.

    Images are copied into one flat folder, so two sources that share a
    basename (``a/fig.png`` and ``b/fig.png``) would otherwise overwrite each
    other and both references would show the same picture. The first source
    keeps ``<prefix><name>``; later clashes get ``<prefix><stem>-2<suffix>``,
    ``-3`` and so on.

    Args:
        images: ``(alt, ref, path)`` triples as returned by
            ``_md.find_local_images``.  NOTE(review): ``path`` is assumed to be
            a hashable ``pathlib.Path`` — the callers already use ``.name`` and
            ``.read_bytes()`` on it.
        name_prefix: Text prepended to every generated filename.

    Returns:
        ``{source_path: destination_filename}``.
    """
    names: dict = {}
    taken: set = set()
    for _alt, _ref, path in images:
        if path in names:
            continue  # same file referenced more than once: reuse its name
        candidate = f"{name_prefix}{path.name}"
        serial = 1
        while candidate in taken:
            serial += 1
            candidate = f"{name_prefix}{path.stem}-{serial}{path.suffix}"
        taken.add(candidate)
        names[path] = candidate
    return names


def _copy_images(doc: ParsedDoc, dest_dir: Path, ref_prefix: str,
                 name_prefix: str = "") -> dict:
    """Copy referenced local images into ``dest_dir``; return ``{old_ref: new_ref}``.

    Args:
        doc: Parsed document; image refs are resolved relative to the folder of
            ``doc.markdown_path`` when that is set.
        dest_dir: Folder receiving the copies. Created only when there is at
            least one image to copy.
        ref_prefix: Text prepended to the copied filename to form the new
            Markdown reference (e.g. ``"note.assets/"``).
        name_prefix: Optional text prepended to each copied filename, used to
            namespace files inside a folder shared by several documents.

    Returns:
        Mapping from each original reference to its rewritten reference; empty
        when the document has no local images.
    """
    base = Path(doc.markdown_path).parent if doc.markdown_path else None
    images = list(_md.find_local_images(doc.markdown, base))
    mapping: dict = {}
    if not images:
        return mapping

    dest_dir.mkdir(parents=True, exist_ok=True)
    names = _dest_names(images, name_prefix)
    written: set = set()
    for _alt, ref, path in images:
        name = names[path]
        if name not in written:  # copy each distinct source exactly once
            (dest_dir / name).write_bytes(path.read_bytes())
            written.add(name)
        mapping[ref] = f"{ref_prefix}{name}"
    return mapping


@register
class ObsidianSink(Sink):
    """Write the document as a note (plus ``<note>.assets/``) into an Obsidian vault."""

    name = "obsidian"
    aliases = ("ob",)
    requires = ("OBSIDIAN_VAULT",)
    label = "Obsidian vault (local Markdown)"
    local = True

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Write ``<title>.md`` with YAML frontmatter and copy its images.

        Raises:
            SinkError: when ``OBSIDIAN_VAULT`` does not point at a directory.
        """
        vault = Path(self.env("OBSIDIAN_VAULT")).expanduser()
        if not vault.is_dir():
            raise SinkError(f"Obsidian vault not found: {vault}")
        subdir = self.env("OBSIDIAN_SUBDIR", "") or ""
        note_dir = vault / subdir if subdir else vault
        note_dir.mkdir(parents=True, exist_ok=True)

        stem = _md.safe_filename(doc.title)
        # Images live next to the note in a per-note folder.
        assets = note_dir / f"{stem}.assets"
        mapping = _copy_images(doc, assets, f"{stem}.assets/")
        body = _md.rewrite_images(doc.markdown, mapping)

        front = _md.yaml_frontmatter({
            "title": doc.title,
            "source": doc.source,
            "modality": doc.modality,
            "tags": ["mineru", "parsed"],
        })
        note_path = note_dir / f"{stem}.md"
        note_path.write_text(f"{front}\n\n{body}\n", encoding="utf-8")
        return SinkResult(sink=self.name, ok=True, url=str(note_path),
                          detail=f"{len(mapping)} image(s)")


@register
class LogseqSink(Sink):
    """Write the document as an outline page into a Logseq graph."""

    name = "logseq"
    requires = ("LOGSEQ_GRAPH",)
    label = "Logseq graph (local outline)"
    local = True

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Write ``pages/<title>.md`` and copy its images into ``assets/``.

        Raises:
            SinkError: when ``LOGSEQ_GRAPH`` does not point at a directory.
        """
        graph = Path(self.env("LOGSEQ_GRAPH")).expanduser()
        if not graph.is_dir():
            raise SinkError(f"Logseq graph not found: {graph}")
        pages = graph / "pages"
        assets = graph / "assets"
        pages.mkdir(parents=True, exist_ok=True)

        stem = _md.safe_filename(doc.title)
        # Namespace asset names by page slug to avoid collisions in the shared assets/.
        prefix = _md.slugify(doc.title)
        mapping = _copy_images(doc, assets, "../assets/", name_prefix=f"{prefix}-")
        body = _md.rewrite_images(doc.markdown, mapping)

        outline = _md.md_to_logseq(body, properties={
            "title": doc.title,
            "source": doc.source,
            "tags": "mineru, parsed",
        })
        page_path = pages / f"{stem}.md"
        page_path.write_text(outline + "\n", encoding="utf-8")
        return SinkResult(sink=self.name, ok=True, url=str(page_path),
                          detail=f"{len(mapping)} image(s)")
from __future__ import annotations

from pathlib import Path

from . import _http, _md
from .base import ParsedDoc, Sink, SinkError, SinkResult, register

API = "https://api.notion.com/v1"
MAX_BLOCKS = 100  # children accepted per create/append request
MAX_TEXT = 2000   # characters accepted in one rich-text ``content`` string
_MAX_RICH_ITEMS = 100  # elements accepted in one ``rich_text`` array


def _rich(text: str) -> list:
    """Return ``text`` as a Notion ``rich_text`` array.

    Text longer than ``MAX_TEXT`` is split into consecutive segments instead of
    being cut off, so long paragraphs and code blocks keep their content. Only
    text beyond ``MAX_TEXT * _MAX_RICH_ITEMS`` characters is dropped, because
    the array itself is capped. Text of ``MAX_TEXT`` characters or fewer
    (including the empty string) yields exactly one segment.
    """
    if len(text) <= MAX_TEXT:
        return [{"type": "text", "text": {"content": text}}]
    pieces = [text[start:start + MAX_TEXT] for start in range(0, len(text), MAX_TEXT)]
    return [{"type": "text", "text": {"content": piece}}
            for piece in pieces[:_MAX_RICH_ITEMS]]


def _block(block_type: str, text: str, **extra) -> dict:
    """Build one Notion block dict of ``block_type`` holding ``text``.

    ``extra`` keys are merged into the type-specific payload next to
    ``rich_text`` (for example ``language`` for ``code`` blocks).
    """
    inner = {"rich_text": _rich(text)}
    inner.update(extra)
    return {"object": "block", "type": block_type, block_type: inner}


def _is_numbered(text: str) -> bool:
    """Return True when ``text`` looks like an ordered-list item (``"12. item"``)."""
    head = text.split(".", 1)
    return len(head) == 2 and head[0].isdigit() and head[1].startswith(" ")


def _heading_level(stripped: str) -> int:
    """Return the ATX heading level (1-6) of ``stripped``, or 0 if it is not a heading."""
    hashes = len(stripped) - len(stripped.lstrip("#"))
    if 1 <= hashes <= 6 and stripped[hashes:hashes + 1] == " ":
        return hashes
    return 0


def _blocks(markdown: str) -> list:
    """Convert flat Markdown lines into a list of Notion block dicts.

    Each non-blank line becomes one block: heading, quote, bulleted or numbered
    list item, or paragraph. Fenced code is collected verbatim into a single
    ``code`` block; an unterminated fence is flushed at end of input. Notion
    has three heading levels, so levels 4-6 are emitted as ``heading_3``
    rather than as paragraphs with literal ``#`` characters.
    """
    blocks = []
    in_code = False
    code_buf: list = []
    for raw in markdown.replace("\r\n", "\n").split("\n"):
        stripped = raw.strip()

        if stripped.startswith("```"):
            if in_code:
                blocks.append(_block("code", "\n".join(code_buf), language="plain text"))
            in_code = not in_code
            code_buf = []
            continue
        if in_code:
            code_buf.append(raw)  # keep indentation inside code
            continue

        if not stripped:
            continue
        level = _heading_level(stripped)
        if level:
            blocks.append(_block(f"heading_{min(level, 3)}", stripped[level:].strip()))
        elif stripped.startswith("> "):
            blocks.append(_block("quote", stripped[2:].strip()))
        elif stripped.startswith("- ") or stripped.startswith("* "):
            blocks.append(_block("bulleted_list_item", stripped[2:].strip()))
        elif _is_numbered(stripped):
            blocks.append(_block("numbered_list_item", stripped.split(".", 1)[1].strip()))
        else:
            blocks.append(_block("paragraph", stripped))

    if in_code:
        blocks.append(_block("code", "\n".join(code_buf), language="plain text"))
    return blocks


@register
class NotionSink(Sink):
    """Create a Notion page under a parent page from the parsed Markdown."""

    name = "notion"
    requires = ("NOTION_API_KEY", "NOTION_PARENT_PAGE_ID")
    label = "Notion page (blocks API)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Create the page with the first ``MAX_BLOCKS`` blocks, then append the rest.

        Raises:
            SinkError: when Notion answers with an error object, or the create
                call returns no page id.
        """
        key = self.env("NOTION_API_KEY")
        parent = self.env("NOTION_PARENT_PAGE_ID")
        version = self.env("NOTION_VERSION", "2022-06-28") or "2022-06-28"
        headers = {
            "Authorization": f"Bearer {key}",
            "Notion-Version": version,
            "Content-Type": "application/json",
        }

        # Count local images for the detail note (refs are left as-is).
        base_dir = Path(doc.markdown_path).parent if doc.markdown_path else None
        n_images = len(_md.find_local_images(doc.markdown, base_dir))

        blocks = _blocks(doc.markdown)
        status, parsed = _http.request_json("POST", f"{API}/pages", headers=headers, payload={
            "parent": {"page_id": parent},
            "properties": {"title": {"title": [{"text": {"content": doc.title}}]}},
            "children": blocks[:MAX_BLOCKS],
        })
        if parsed.get("object") == "error":
            raise SinkError(parsed.get("message") or f"Notion API error (HTTP {status})")
        created_id = parsed.get("id")
        if not created_id:
            raise SinkError(f"Notion did not return a page id (HTTP {status})")
        page_url = parsed.get("url")

        # Anything beyond the inline children is appended in MAX_BLOCKS-sized chunks.
        for start in range(MAX_BLOCKS, len(blocks), MAX_BLOCKS):
            chunk = blocks[start:start + MAX_BLOCKS]
            ch_status, ch_parsed = _http.request_json(
                "PATCH", f"{API}/blocks/{created_id}/children",
                headers=headers, payload={"children": chunk},
            )
            if ch_parsed.get("object") == "error":
                raise SinkError(ch_parsed.get("message")
                                or f"Notion block append failed (HTTP {ch_status})")

        if n_images:
            detail = (f"text+structure ({n_images} local images not embedded; "
                      f"Notion needs file upload)")
        else:
            detail = "text+structure"
        return SinkResult(sink=self.name, ok=True, url=page_url, detail=detail)
from . import _http, _md
from .base import ParsedDoc, Sink, SinkError, SinkResult, register


def _page_html(title: str, body_html: str) -> str:
    """Wrap ``body_html`` in a complete HTML document titled ``title``.

    The title is HTML-escaped and placed in ``<title>``; ``body_html`` is
    inserted as-is because it is already markup.
    """
    return (
        "<!DOCTYPE html><html><head>"
        f"<title>{html.escape(title)}</title>"
        "</head>"
        f"<body>{body_html}</body></html>"
    )


@register
class OneNoteSink(Sink):
    """Create a OneNote page in a section through Microsoft Graph."""

    name = "onenote"
    aliases = ("msonenote",)
    requires = ("ONENOTE_TOKEN", "ONENOTE_SECTION_ID")
    label = "OneNote section page (Microsoft Graph)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Convert the Markdown to an HTML document and POST it as a new page.

        Raises:
            SinkError: on any HTTP status >= 400, or any other status that is
                not ``201 Created``.
        """
        token = self.env("ONENOTE_TOKEN")
        section = self.env("ONENOTE_SECTION_ID")

        # Post a full document so the page gets its title from <title>
        # instead of having the title text run into the body.
        page = _page_html(doc.title, _md.md_to_html(doc.markdown))

        status, raw = _http.http_request(
            "POST",
            f"https://graph.microsoft.com/v1.0/me/onenote/sections/{section}/pages",
            headers={
                "Authorization": f"Bearer {token}",
                "Content-Type": "text/html",
            },
            data=page.encode("utf-8"),
        )
        if status >= 400:
            preview = raw.decode("utf-8", "replace") if raw else ""
            raise SinkError(f"OneNote HTTP {status}: {preview[:200]}")
        if status != 201:
            raise SinkError(f"OneNote unexpected response (HTTP {status})")

        # The page link is optional: an unreadable body still counts as success.
        parsed = {}
        if raw:
            try:
                parsed = json.loads(raw.decode("utf-8"))
            except (ValueError, UnicodeDecodeError):
                parsed = {}
        if not isinstance(parsed, dict):
            parsed = {}  # valid JSON but not an object: nothing to read from
        links = parsed.get("links") or {}
        web = links.get("oneNoteWebUrl") or {}
        url = web.get("href")

        return SinkResult(
            sink=self.name, ok=True, url=url,
            detail="converted Markdown->HTML (remote images only; OAuth token required)",
        )
from __future__ import annotations

import itertools
import re

from .base import ParsedDoc, Sink, SinkError, SinkResult, register

_HEADING = re.compile(r"^(#{1,6})\s+(.*)$")
_FENCE = "```"
_INSTALL_HINT = (
    'Roam sink needs the official SDK — pip install '
    '"roam-client @ git+https://github.com/Roam-Research/backend-sdks.git#subdirectory=python"'
)


def md_to_roam_tree(markdown: str) -> list:
    """Convert Markdown into a nested Roam block tree.

    Headings become parent blocks (``heading`` 1–3); the lines under a heading
    nest beneath it. A fenced code block is kept as ONE block whose string is
    the whole fence (indentation and blank lines preserved), so a ``#`` comment
    inside code is never mistaken for a heading. An unterminated fence is
    closed at end of input.

    Returns ``[{"string", "heading"?, "children": [...]}, ...]``.
    """
    roots: list = []
    stack: list = []  # [(heading_level, node)]
    fence = None  # lines of the fenced block being collected, else None

    def attach(node: dict) -> None:
        (stack[-1][1]["children"] if stack else roots).append(node)

    for raw in markdown.replace("\r\n", "\n").split("\n"):
        line = raw.strip()

        if fence is not None:
            fence.append(raw.rstrip())
            if line.startswith(_FENCE):  # closing fence
                attach({"string": "\n".join(fence), "children": []})
                fence = None
            continue
        if line.startswith(_FENCE):  # opening fence (may carry a language tag)
            fence = [line]
            continue

        if not line:
            continue
        match = _HEADING.match(line)
        if match:
            level = len(match.group(1))
            node = {"string": match.group(2), "heading": min(level, 3), "children": []}
            # Pop back to the nearest shallower heading; it becomes the parent.
            while stack and stack[-1][0] >= level:
                stack.pop()
            attach(node)
            stack.append((level, node))
        else:
            attach({"string": line, "children": []})

    if fence is not None:
        fence.append(_FENCE)
        attach({"string": "\n".join(fence), "children": []})
    return roots


def tree_to_actions(children: list, parent_uid: str, uidgen) -> list:
    """Flatten a block tree into ``create-block`` actions for one batch request.

    Args:
        children: Sibling nodes as produced by ``md_to_roam_tree``.
        parent_uid: uid of the block or page the siblings are created under.
        uidgen: Zero-argument callable returning a fresh uid on every call.

    Returns:
        Actions in depth-first order, so every parent is created before its
        children; ``order`` is the node's index among its siblings.
    """
    actions: list = []
    for order, node in enumerate(children):
        uid = uidgen()
        block = {"string": node["string"], "uid": uid}
        if node.get("heading"):
            block["heading"] = node["heading"]
        actions.append({
            "action": "create-block",
            "location": {"parent-uid": parent_uid, "order": order},
            "block": block,
        })
        actions.extend(tree_to_actions(node.get("children", []), uid, uidgen))
    return actions


@register
class RoamSink(Sink):
    """Create a Roam page and fill it with the document in one batch write."""

    name = "roam"
    aliases = ("roamresearch",)
    requires = ("ROAM_API_TOKEN", "ROAM_GRAPH_NAME")
    label = "Roam Research (batch-actions, optional dep)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Create the page, then send every block in a single ``batch-actions`` call.

        Raises:
            SinkError: when the optional ``roam-client`` SDK is not installed.
        """
        try:
            from roam_client.client import create_page, initialize_graph
        except ImportError as exc:  # pragma: no cover - exercised via SinkError path
            raise SinkError(_INSTALL_HINT) from exc
        import uuid  # stdlib; local so the module's import block stays untouched

        token = self.env("ROAM_API_TOKEN")
        graph = self.env("ROAM_GRAPH_NAME")
        client = initialize_graph({"token": token, "graph": graph})

        # uids are graph-wide identifiers. A fixed "mu0000001…" sequence would
        # collide with blocks from any earlier delivery, so every delivery gets
        # its own random stem; the counter keeps uids distinct within the batch.
        run_id = uuid.uuid4().hex[:8]
        page_uid = f"mu{run_id}"
        # Create the page with a uid we choose, so top-level blocks can name it
        # as their parent (``parent-uid`` takes a uid, not a page title).
        create_page(client, {"page": {"title": doc.title, "uid": page_uid}})

        counter = itertools.count(1)
        actions = tree_to_actions(
            md_to_roam_tree(doc.markdown),
            page_uid,
            lambda: f"mu{run_id}{next(counter):05d}",
        )
        if actions:
            client.call(
                f"/api/graph/{graph}/write",
                "POST",
                {"action": "batch-actions", "actions": actions},
            )
        return SinkResult(
            sink=self.name, ok=True,
            url=f"https://roamresearch.com/#/app/{graph}",
            detail=f"{len(actions)} block(s) via batch-actions (images need public URLs)",
        )
from . import _http, _md
from .base import ParsedDoc, Sink, SinkError, SinkResult, register


@register
class SiYuanSink(Sink):
    """Create a SiYuan document from Markdown through the local kernel API."""

    name = "siyuan"
    requires = ("SIYUAN_TOKEN",)
    label = "SiYuan notebook (local kernel API)"

    def _json_post(self, base: str, path: str, headers: dict, payload: dict):
        """POST JSON; return ``data`` after verifying ``code == 0``."""
        try:
            status, parsed = _http.request_json("POST", f"{base}{path}",
                                                headers=headers, payload=payload)
        except Exception as exc:  # noqa: BLE001
            # Any transport failure is reported as "kernel not reachable".
            raise self._unreachable(base, exc) from exc
        return self._unwrap(base, status, parsed)

    def _upload_post(self, base: str, headers: dict, content_type: str, body: bytes):
        """POST a multipart body; return ``data`` after verifying ``code == 0``."""
        hdrs = dict(headers)
        hdrs["Content-Type"] = content_type
        try:
            status, raw = _http.http_request("POST", f"{base}/api/asset/upload",
                                             headers=hdrs, data=body)
        except Exception as exc:  # noqa: BLE001
            raise self._unreachable(base, exc) from exc
        parsed: dict = {}
        if raw:
            try:
                parsed = json.loads(raw.decode("utf-8"))
            except (ValueError, UnicodeDecodeError):
                parsed = {}
        if not isinstance(parsed, dict):
            parsed = {}  # non-object JSON: let _unwrap report it as an API error
        return self._unwrap(base, status, parsed)

    @staticmethod
    def _unreachable(base: str, exc=None) -> SinkError:
        """Build the "kernel not reachable" error, appending ``exc`` when given."""
        suffix = f" ({exc})" if exc else ""
        return SinkError(
            f"SiYuan kernel not reachable at {base} — start SiYuan and enable "
            f"the API token{suffix}"
        )

    def _unwrap(self, base: str, status: int, parsed: dict):
        """Return the ``data`` member of a kernel response envelope.

        Raises:
            SinkError: when ``status`` is 0 (treated as "no connection") or the
                envelope's ``code`` is not 0.
        """
        if status == 0:
            raise self._unreachable(base)
        if parsed.get("code") != 0:
            raise SinkError(parsed.get("msg") or f"SiYuan API error (HTTP {status})")
        return parsed.get("data")

    def _default_notebook(self, base: str, headers: dict) -> str:
        """Return the id of the notebook to use when ``SIYUAN_NOTEBOOK`` is unset.

        The first notebook not flagged ``closed`` is preferred. If every
        listed notebook is closed, the first one is used.
        NOTE(review): that SiYuan rejects writes to a closed notebook is an
        assumption — confirm against the kernel API.

        Raises:
            SinkError: when no notebook is listed or the chosen entry has no id.
        """
        data = self._json_post(base, "/api/notebook/lsNotebooks", headers, {})
        notebooks = (data or {}).get("notebooks") or []
        if not notebooks:
            raise SinkError("SiYuan has no notebooks — create one before delivering")
        usable = [nb for nb in notebooks if not nb.get("closed")]
        notebook = (usable or notebooks)[0].get("id")
        if not notebook:
            raise SinkError("SiYuan returned a notebook without an id")
        return notebook

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Upload the document's local images, then create the document.

        Raises:
            SinkError: when the kernel is unreachable, answers with a non-zero
                ``code``, lists no notebooks, or returns no document id.
        """
        base = (self.env("SIYUAN_API_URL", "http://127.0.0.1:6806")
                or "http://127.0.0.1:6806").rstrip("/")
        token = self.env("SIYUAN_TOKEN")
        headers = {"Authorization": f"Token {token}"}

        notebook = self.env("SIYUAN_NOTEBOOK") or self._default_notebook(base, headers)

        base_dir = Path(doc.markdown_path).parent if doc.markdown_path else None
        images = _md.find_local_images(doc.markdown, base_dir)
        mapping = {}
        for _alt, ref, path in images:
            content_type, body = _http.encode_multipart(
                fields={"assetsDirPath": "/assets/"},
                files=[("file[]", path.name, path.read_bytes())],
            )
            data = self._upload_post(base, headers, content_type, body)
            succ_map = (data or {}).get("succMap") or {}
            # Best effort: an image the kernel did not accept keeps its local ref.
            if path.name in succ_map:
                mapping[ref] = succ_map[path.name]
        body_md = _md.rewrite_images(doc.markdown, mapping)

        docid = self._json_post(base, "/api/filetree/createDocWithMd", headers, {
            "notebook": notebook,
            "path": "/" + _md.safe_filename(doc.title),
            "markdown": body_md,
        })
        if not docid:
            raise SinkError("SiYuan did not return a document id")

        return SinkResult(
            sink=self.name, ok=True,
            url=f"siyuan://blocks/{docid}",
            detail=f"{len(mapping)} image(s)",
        )
from __future__ import annotations

import urllib.parse

from . import _http, _md
from .base import ParsedDoc, Sink, SinkError, SinkResult, register


@register
class SlackSink(Sink):
    """Upload the parsed Markdown to a Slack channel as a ``.md`` file."""

    name = "slack"
    requires = ("SLACK_BOT_TOKEN", "SLACK_CHANNEL")
    label = "Slack channel (file upload)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Run the three-step external upload and share the file in the channel.

        Raises:
            SinkError: when any step reports ``ok: false``, the byte upload
                does not return HTTP 200, or Slack omits the upload URL / file id.
        """
        token = self.env("SLACK_BOT_TOKEN")
        channel = self.env("SLACK_CHANNEL")
        auth = {"Authorization": f"Bearer {token}"}

        content = doc.markdown.encode("utf-8")
        filename = _md.slugify(doc.title) + ".md"

        # Step 1: reserve an external upload URL + file id. This endpoint wants
        # form-encoded data, so use http_request and parse the JSON response.
        form = urllib.parse.urlencode({
            "filename": filename,
            "length": len(content),
        }).encode("utf-8")
        status, raw = _http.http_request(
            "POST",
            "https://slack.com/api/files.getUploadURLExternal",
            headers={**auth, "Content-Type": "application/x-www-form-urlencoded"},
            data=form,
        )
        parsed = _parse_json(raw)
        if not parsed.get("ok"):
            raise SinkError(parsed.get("error") or f"Slack getUploadURLExternal failed (HTTP {status})")
        upload_url = parsed.get("upload_url")
        file_id = parsed.get("file_id")
        if not upload_url or not file_id:
            raise SinkError("Slack did not return an upload URL / file id")

        # Step 2: upload the raw bytes to the reserved URL.
        up_status, _up_body = _http.http_request(
            "POST", upload_url,
            headers={"Content-Type": "application/octet-stream"},
            data=content,
        )
        if up_status != 200:
            raise SinkError(f"Slack file upload failed (HTTP {up_status})")

        # Step 3: finalize the upload into the channel.
        status, parsed = _http.request_json(
            "POST",
            "https://slack.com/api/files.completeUploadExternal",
            headers=auth,
            payload={
                "files": [{"id": file_id, "title": doc.title}],
                "channel_id": channel,
                "initial_comment": f"Parsed: {doc.title}",
            },
        )
        if not parsed.get("ok"):
            raise SinkError(parsed.get("error") or f"Slack completeUploadExternal failed (HTTP {status})")

        # The permalink is a convenience; a missing or malformed entry must not
        # turn a successful upload into a crash.
        files = parsed.get("files") or [{}]
        first = files[0] if isinstance(files, list) and isinstance(files[0], dict) else {}
        url = first.get("permalink")
        return SinkResult(
            sink=self.name, ok=True, url=url,
            detail="uploaded .md file (images not embedded)",
        )


def _parse_json(raw):
    """Decode a raw HTTP response body into a dict, tolerating bad input.

    Returns ``{}`` when the body is empty, is not valid UTF-8 / JSON, or
    decodes to anything other than a JSON object — callers go straight to
    ``parsed.get(...)``, so a list or bare string must not reach them.
    """
    import json
    if not raw:
        return {}
    try:
        parsed = json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError):
        return {}
    return parsed if isinstance(parsed, dict) else {}
from . import _http
from .base import ParsedDoc, Sink, SinkError, SinkResult, register

# Open API endpoint that creates a task.
API_URL = "https://api.ticktick.com/open/v1/task"


@register
class TickTickSink(Sink):
    """Create a TickTick task: document title -> task title, Markdown -> content."""

    name = "ticktick"
    aliases = ("dida", "滴答清单")
    requires = ("TICKTICK_TOKEN",)
    label = "TickTick task (滴答清单)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """POST the task and confirm the API returned an id for it.

        ``TICKTICK_PROJECT_ID`` is optional; when set, the task is created in
        that project.

        Raises:
            SinkError: on an HTTP status >= 400, or a response without ``id``.
        """
        bearer = self.env("TICKTICK_TOKEN")
        project = self.env("TICKTICK_PROJECT_ID")

        task = {"title": doc.title, "content": doc.markdown}
        if project:
            task.update(projectId=project)

        status, parsed = _http.request_json(
            "POST",
            API_URL,
            headers={"Authorization": f"Bearer {bearer}"},
            payload=task,
        )

        failed_http = status >= 400
        if failed_http:
            raise SinkError(f"TickTick HTTP {status}: {parsed}")
        task_id = parsed.get("id")
        if not task_id:
            raise SinkError(f"TickTick returned no task id: {parsed}")

        return SinkResult(
            sink=self.name,
            ok=True,
            url=None,
            detail="task content (no inline images supported by TickTick)",
        )
@register
class WeComSink(Sink):
    """Send the parsed Markdown as a WeCom self-built-app ``markdown`` message.

    Two HTTP calls are made: ``gettoken`` (corp id + secret -> access token),
    then ``message/send`` with the truncated Markdown body.
    """

    name = "wecom"
    aliases = ("企业微信", "wechatwork")
    requires = ("WECOM_CORPID", "WECOM_CORPSECRET", "WECOM_AGENTID")
    label = "WeCom app markdown (企业微信)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Fetch an access token and post ``doc.markdown`` as a markdown message.

        The recipient defaults to ``@all`` and can be overridden with
        ``WECOM_TOUSER``.

        Raises:
            SinkError: when ``WECOM_AGENTID`` is not an integer, when the token
                fetch fails, or when the send call reports a non-zero
                ``errcode``.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import urlencode

        # The content cap is expressed in bytes, not characters.
        max_content_bytes = 2048

        corpid = self.env("WECOM_CORPID")
        secret = self.env("WECOM_CORPSECRET")
        agentid = self.env("WECOM_AGENTID")
        touser = self.env("WECOM_TOUSER", "@all")

        # Validate before any network traffic so a bad config surfaces as a
        # SinkError instead of a bare ValueError after the token round-trip.
        try:
            agent_number = int(agentid)
        except (TypeError, ValueError) as exc:
            raise SinkError(
                f"WECOM_AGENTID must be an integer, got {agentid!r}"
            ) from exc

        # Step 1: fetch an access token. Credentials are percent-encoded so
        # reserved characters cannot corrupt the query string.
        token_query = urlencode({"corpid": corpid, "corpsecret": secret})
        token_url = f"{TOKEN_URL}?{token_query}"
        status, parsed = _http.request_json("GET", token_url)
        if parsed.get("errcode") not in (0, None) or not parsed.get("access_token"):
            raise SinkError(parsed.get("errmsg") or f"WeCom token fetch failed: {parsed}")
        token = parsed["access_token"]

        # Truncate on the UTF-8 byte budget; errors="ignore" drops a multi-byte
        # character that the cut would otherwise split in half.
        content = (
            doc.markdown.encode("utf-8")[:max_content_bytes]
            .decode("utf-8", errors="ignore")
        )

        # Step 2: send the markdown message.
        send_query = urlencode({"access_token": token})
        send_url = f"{SEND_URL}?{send_query}"
        payload = {
            "touser": touser,
            "msgtype": "markdown",
            "agentid": agent_number,
            "markdown": {"content": content},
        }
        status, parsed = _http.request_json("POST", send_url, payload=payload)
        if parsed.get("errcode") not in (0, None):
            raise SinkError(parsed.get("errmsg") or f"WeCom send failed: {parsed}")

        return SinkResult(
            sink=self.name,
            ok=True,
            url=None,
            detail="markdown notification (WeCom markdown is a limited subset, "
                   "2048-byte cap, no inline images)",
        )
There is no official Python SDK, so: + +* Markdown→DOCX uses the maintained, pure-pip ``html-for-docx`` package + (reusing this project's Markdown→HTML), lazily imported so the core stays + zero-dependency. Install with ``pip install mineru-skill[wps]``. +* The kdocs WPS-2 request signing (plain SHA-1) and multipart upload are done + with the standard library — small and fully documented. + +Cloud upload requires an approved kdocs developer app (``WPS_APP_ID`` / +``WPS_APP_SECRET``) and a provisioned appspace; it is opt-in and surfaces the +raw kdocs error on failure. Docs: https://developer.kdocs.cn/server/guide/signature.html +""" + +from __future__ import annotations + +import email.utils +import hashlib +import io +import json + +from . import _http, _md +from .base import ParsedDoc, Sink, SinkError, SinkResult, register + +KDOCS_UPLOAD = "https://developer.kdocs.cn/api/v1/openapi/appspace/files/upload" + + +def _markdown_to_docx_bytes(markdown: str) -> bytes: + """Convert Markdown → HTML → DOCX bytes via the optional html-for-docx lib.""" + try: + from html4docx import HtmlToDocx # pip install html-for-docx + except ImportError as exc: # pragma: no cover - exercised via SinkError path + raise SinkError( + "WPS sink needs a Markdown→DOCX converter — " + "pip install 'mineru-skill[wps]' (i.e. pip install html-for-docx)" + ) from exc + html = _md.md_to_html(markdown) + document = HtmlToDocx().parse_html_string(html) + buf = io.BytesIO() + document.save(buf) + return buf.getvalue() + + +def _wps2_headers(app_id: str, app_secret: str, body: bytes, content_type: str) -> dict: + """Build kdocs WPS-2 auth headers. + + signature = sha1(app_secret + content_md5 + content_type + date) hex. + Content-Md5 / Content-Type must match the exact wire body and header sent. 
+ """ + content_md5 = hashlib.md5(body).hexdigest() + date = email.utils.formatdate(usegmt=True) # RFC1123 GMT + signature = hashlib.sha1( + (app_secret + content_md5 + content_type + date).encode("utf-8") + ).hexdigest() + return { + "Date": date, + "Content-Md5": content_md5, + "Content-Type": content_type, + "Authorization": f"WPS-2:{app_id}:{signature}", + } + + +@register +class WpsSink(Sink): + name = "wps" + aliases = ("kdocs", "金山文档", "金山") + requires = ("WPS_APP_ID", "WPS_APP_SECRET") + label = "WPS / 金山文档 (Markdown→DOCX upload, optional dep)" + + def deliver(self, doc: ParsedDoc) -> SinkResult: + app_id = self.env("WPS_APP_ID") + app_secret = self.env("WPS_APP_SECRET") + + docx_bytes = _markdown_to_docx_bytes(doc.markdown) + filename = _md.safe_filename(doc.title) + ".docx" + + fields = {} + parent_path = self.env("WPS_PARENT_PATH") + parent_token = self.env("WPS_PARENT_TOKEN") + if parent_path: + fields["parent_path"] = parent_path + if parent_token: + fields["parent_token"] = parent_token + + content_type, body = _http.encode_multipart( + fields=fields, files=[("file", filename, docx_bytes)] + ) + headers = _wps2_headers(app_id, app_secret, body, content_type) + + status, raw = _http.http_request("POST", KDOCS_UPLOAD, headers=headers, data=body) + try: + parsed = json.loads(raw.decode("utf-8")) if raw else {} + except (ValueError, UnicodeDecodeError): + parsed = {} + if status >= 400 or parsed.get("code") not in (0, None): + raise SinkError(parsed.get("message") or parsed.get("msg") or f"kdocs HTTP {status}") + + file_token = (parsed.get("data") or {}).get("file_token") + return SinkResult( + sink=self.name, ok=True, url=file_token, + detail="Markdown→DOCX uploaded to 金山文档 (experimental; needs a provisioned appspace)", + ) diff --git a/skills/developing/mineru/scripts/sinks/yuque.py b/skills/developing/mineru/scripts/sinks/yuque.py new file mode 100644 index 0000000..d0bc65a --- /dev/null +++ b/skills/developing/mineru/scripts/sinks/yuque.py @@ -0,0 
@register
class YuqueSink(Sink):
    """Sink that creates a private Markdown doc in a Yuque repository."""

    name = "yuque"
    aliases = ("语雀",)
    requires = ("YUQUE_TOKEN", "YUQUE_NAMESPACE")
    label = "Yuque doc (open API)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """POST the Markdown to ``/repos/<namespace>/docs`` and return the doc URL.

        Local image references are only counted (for the ``detail`` text); the
        Markdown body is sent unchanged.

        Raises:
            SinkError: when the response has no ``data`` object.
        """
        auth_token = self.env("YUQUE_TOKEN")
        namespace = self.env("YUQUE_NAMESPACE")
        request_headers = {
            "X-Auth-Token": auth_token,
            "User-Agent": "MinerU-Skill/3.0",
            "Content-Type": "application/json",
        }

        # Image refs are resolved relative to the Markdown file when its path is known.
        if doc.markdown_path:
            base_dir = Path(doc.markdown_path).parent
        else:
            base_dir = None
        local_images = _md.find_local_images(doc.markdown, base_dir)
        n_images = len(local_images)

        new_doc = {
            "title": doc.title,
            "slug": _md.slugify(doc.title),
            "public": 0,
            "format": "markdown",
            "body": doc.markdown,
        }
        status, parsed = _http.request_json(
            "POST",
            f"{API}/repos/{namespace}/docs",
            headers=request_headers,
            payload=new_doc,
        )

        data = parsed.get("data")
        if not data:
            message = parsed.get("message")
            if status >= 400 or message:
                raise SinkError(message or f"HTTP {status}")
            raise SinkError(f"Yuque returned no doc data (HTTP {status})")

        slug = data.get("slug")
        detail = (
            f"text only ({n_images} local image(s); host images publicly to embed)"
            if n_images
            else "text only"
        )
        return SinkResult(
            sink=self.name,
            ok=True,
            url=f"https://www.yuque.com/{namespace}/{slug}",
            detail=detail,
        )
class SplitError(Exception):
    """Raised when splitting is requested but cannot be performed."""


def _load_pypdf():
    """Import and return the optional ``pypdf`` module.

    Raises:
        SplitError: when ``pypdf`` is not installed.
    """
    try:
        import pypdf  # noqa: F401
    except ImportError as exc:
        raise SplitError(
            "--split needs the pypdf library — pip install 'mineru-skill[split]' "
            "(i.e. pip install pypdf)"
        ) from exc
    return pypdf


def pdf_page_count(path) -> int:
    """Return how many pages the local PDF at ``path`` has (requires pypdf)."""
    reader = _load_pypdf().PdfReader(str(path))
    return len(reader.pages)


def split_pdf(path, max_pages: int, out_dir) -> list:
    """Write ``path`` out as consecutive parts of at most ``max_pages`` pages.

    Parts are named ``<stem>__partNNN.pdf`` (1-based, zero-padded to three
    digits) and written under ``out_dir``, which is created if needed.

    Returns:
        The list of part paths, in page order. When the PDF already fits in
        ``max_pages`` nothing is written and the list holds only the original
        path.

    Raises:
        SplitError: when ``max_pages`` is below 1 or pypdf is unavailable.
    """
    if max_pages < 1:
        raise SplitError("max_pages must be >= 1")

    pypdf = _load_pypdf()
    source = Path(path)
    reader = pypdf.PdfReader(str(path))
    total = len(reader.pages)
    if total <= max_pages:
        return [source]

    destination = Path(out_dir)
    destination.mkdir(parents=True, exist_ok=True)

    written = []
    for number, first in enumerate(range(0, total, max_pages), start=1):
        last = min(first + max_pages, total)  # exclusive upper page index
        writer = pypdf.PdfWriter()
        for index in range(first, last):
            writer.add_page(reader.pages[index])
        target = destination / f"{source.stem}__part{number:03d}.pdf"
        with open(target, "wb") as handle:
            writer.write(handle)
        written.append(target)
    return written
+ }] + }) result = rag_retrieve(query, top_k, trace_id) @@ -328,7 +333,12 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]: query = arguments.get("query", "") if not query: - return create_error_response(request_id, -32602, "Missing required parameter: query") + return create_success_response(request_id, { + "content": [{ + "type": "text", + "text": "Error: missing required parameter 'query'. Please call this tool again with a non-empty 'query' argument describing what you want to retrieve." + }] + }) result = table_rag_retrieve(query, trace_id) diff --git a/skills/autoload/support/rag-retrieve/rag_retrieve_server.py b/skills/autoload/support/rag-retrieve/rag_retrieve_server.py index 09e0924..bddaa3e 100644 --- a/skills/autoload/support/rag-retrieve/rag_retrieve_server.py +++ b/skills/autoload/support/rag-retrieve/rag_retrieve_server.py @@ -314,7 +314,12 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]: top_k = arguments.get("top_k", 100) if not query: - return create_error_response(request_id, -32602, "Missing required parameter: query") + return create_success_response(request_id, { + "content": [{ + "type": "text", + "text": "Error: missing required parameter 'query'. Please call this tool again with a non-empty 'query' argument describing what you want to retrieve." + }] + }) result = rag_retrieve(query, top_k, trace_id) @@ -328,7 +333,12 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]: query = arguments.get("query", "") if not query: - return create_error_response(request_id, -32602, "Missing required parameter: query") + return create_success_response(request_id, { + "content": [{ + "type": "text", + "text": "Error: missing required parameter 'query'. Please call this tool again with a non-empty 'query' argument describing what you want to retrieve." 
+ }] + }) result = table_rag_retrieve(query, trace_id) diff --git a/skills/developing/rag-retrieve-no-citation/rag_retrieve_server.py b/skills/developing/rag-retrieve-no-citation/rag_retrieve_server.py index 99a3894..b7aa27c 100644 --- a/skills/developing/rag-retrieve-no-citation/rag_retrieve_server.py +++ b/skills/developing/rag-retrieve-no-citation/rag_retrieve_server.py @@ -167,7 +167,12 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]: top_k = arguments.get("top_k", 100) if not query: - return create_error_response(request_id, -32602, "Missing required parameter: query") + return create_success_response(request_id, { + "content": [{ + "type": "text", + "text": "Error: missing required parameter 'query'. Please call this tool again with a non-empty 'query' argument describing what you want to retrieve." + }] + }) result = rag_retrieve(query, top_k) diff --git a/skills/onprem/rag-retrieve-only/rag_retrieve_server.py b/skills/onprem/rag-retrieve-only/rag_retrieve_server.py index ff30fbc..afe9290 100644 --- a/skills/onprem/rag-retrieve-only/rag_retrieve_server.py +++ b/skills/onprem/rag-retrieve-only/rag_retrieve_server.py @@ -193,7 +193,12 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]: top_k = arguments.get("top_k", 100) if not query: - return create_error_response(request_id, -32602, "Missing required parameter: query") + return create_success_response(request_id, { + "content": [{ + "type": "text", + "text": "Error: missing required parameter 'query'. Please call this tool again with a non-empty 'query' argument describing what you want to retrieve." 
+ }] + }) result = rag_retrieve(query, top_k) diff --git a/skills/support/rag-retrieve-only/rag_retrieve_server.py b/skills/support/rag-retrieve-only/rag_retrieve_server.py index ff30fbc..afe9290 100644 --- a/skills/support/rag-retrieve-only/rag_retrieve_server.py +++ b/skills/support/rag-retrieve-only/rag_retrieve_server.py @@ -193,7 +193,12 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]: top_k = arguments.get("top_k", 100) if not query: - return create_error_response(request_id, -32602, "Missing required parameter: query") + return create_success_response(request_id, { + "content": [{ + "type": "text", + "text": "Error: missing required parameter 'query'. Please call this tool again with a non-empty 'query' argument describing what you want to retrieve." + }] + }) result = rag_retrieve(query, top_k) From bb74aee41ba94a8b3514aafe1d736d913165197b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Sun, 7 Jun 2026 08:58:22 +0800 Subject: [PATCH 5/5] add table-query --- skills/developing/table-query/SKILL.md | 137 +++++++++++ .../table-query/scripts/table_query.py | 213 ++++++++++++++++++ skills/developing/table-query/skill.yaml | 25 ++ .../table-query/verify_table_query.sh | 67 ++++++ 4 files changed, 442 insertions(+) create mode 100644 skills/developing/table-query/SKILL.md create mode 100755 skills/developing/table-query/scripts/table_query.py create mode 100644 skills/developing/table-query/skill.yaml create mode 100755 skills/developing/table-query/verify_table_query.sh diff --git a/skills/developing/table-query/SKILL.md b/skills/developing/table-query/SKILL.md new file mode 100644 index 0000000..9a9156a --- /dev/null +++ b/skills/developing/table-query/SKILL.md @@ -0,0 +1,137 @@ +--- +name: table-query +description: Query structured spreadsheet/table data (Excel/CSV) to answer questions about values, prices, quantities, inventory, specifications, rankings, comparisons, summaries, aggregations, lists, or any 
numeric/tabular lookup. Use this skill whenever the answer likely comes from uploaded tables. You locate tables, read their schema, author SQLite SQL yourself, and run it — the backend does no LLM work, so it is fast. +category: Data & Retrieval +--- + +# Table Query + +Answer table/spreadsheet questions by authoring and running SQLite SQL against the +bot's uploaded Excel data. The backend is a thin, fast SQL executor — **you** do the +thinking (rewrite the question, pick tables, write SQL). Row-level citations +(`__src`) are produced for you. + +## When to use + +Use `table-query` for: values, prices, quantities, inventory, specifications, +rankings, comparisons, summaries, aggregations (sum/avg/count), lists, person / +project / product lookups, monthly/period totals, or any question whose answer +comes from structured tables. For pure concept / definition / policy / explanation +questions, use the `rag_retrieve` document tool instead. + +## Workflow (do this in order, once) + +1. **search-tables** — rewrite the user's question into a retrieval query (core + entity + attributes + synonyms), then locate candidate tables. Call this **once**. +2. **get-schemas** — for the relevant subset of returned tables, fetch their + `CREATE TABLE` schema and sample rows. Never write SQL without seeing the schema. +3. **author SQL** — write a SQLite query plan as JSON (see below). +4. **run-sql** — execute the plan. It returns CSV with an `__src` column and a + `file_ref_table` mapping plus citation instructions. +5. **answer + cite** — write the answer and add `` tags built from + `__src` + `file_ref_table`. Never print the `__src` column to the user. + +### Anti-waste rules + +- Call **search-tables at most once** per question. Do not re-locate tables you + already have schemas for. +- If `run-sql` returns an error, fix the SQL and call **run-sql** again (at most ~2 + tries). Do **NOT** restart from search-tables. 
+- If `search-tables` finds nothing, fall back to the `rag_retrieve` document tool. + +## Commands + +```bash +# 1. locate tables +python {SKILL_DIR}/scripts/table_query.py search-tables --query "2025 April May June sales total" --top-k 20 + +# 2. read schema + sample rows for the tables you picked +python {SKILL_DIR}/scripts/table_query.py get-schemas --tables "sales_2025,customers" + +# 3. run your authored plan — pipe the JSON plan via stdin (no temp file needed) +python {SKILL_DIR}/scripts/table_query.py run-sql <<'PLAN' +{"queries":[{"step":1,"sql":"CREATE TEMP TABLE \"final_table_step1\" AS SELECT \"month\", SUM(\"amount\") AS \"total\" FROM \"sales_2025\" GROUP BY \"month\"","source_table_names":["sales_2025"],"destine_table_name":"final_table_step1","destine_table_type":"final","destine_table_description":"Monthly totals"}]} +PLAN +``` + +## Authoring the SQL plan + +The plan is a JSON object `{ "queries": [ ... ] }` that you pass to `run-sql` **on +stdin via a quoted heredoc** (`<<'PLAN' ... PLAN`). The quoted delimiter keeps all +the double quotes, single quotes and `$` in your SQL intact — no shell escaping. +(You may instead write it to a file and use `--plan-file path.json` if a plan is very +large, but stdin is the default and needs no extra step.) + +Each query is one SQL step: + +```json +{ + "queries": [ + { + "step": 1, + "sql": "CREATE TEMP TABLE \"final_table_step1\" AS SELECT \"month\", SUM(\"amount\") AS \"total\" FROM \"sales_2025\" WHERE \"month\" IN ('2025-04','2025-05','2025-06') GROUP BY \"month\"", + "source_table_names": ["sales_2025"], + "destine_table_name": "final_table_step1", + "destine_table_type": "final", + "destine_table_description": "Monthly sales totals for Apr-Jun 2025" + } + ] +} +``` + +Field meaning: +- `step`: 1-based execution order. +- `sql`: a SQLite statement, normally `CREATE TEMP TABLE "..." AS SELECT ...`. 
+- `source_table_names`: tables this step reads (original tables, or earlier steps' + `destine_table_name` for multi-step plans). +- `destine_table_name`: the temp table this step creates. Convention: + `intermediate_table_stepN` or `final_table_stepN`. +- `destine_table_type`: `"final"` for results the user should see, `"intermediate"` + for helper steps. **At least one `final` is required.** +- `destine_table_description`: short human description of the result. + +### SQL rules (important) + +- **Quote every identifier** with double quotes: `"column name"`, `"table name"`. +- String literals use single quotes; escape `'` as `''`. +- Prefer **one logical result per `final` table**. For multiple separate results, + emit multiple `final` tables (e.g. step1, step2) — do **NOT** `UNION` unrelated results. +- For row-level citations to be precise, keep `final` steps as simple single-table + `SELECT`s (no `JOIN` / `GROUP BY` / aggregation). Aggregations still work but the + citation degrades to file+sheet level (`F1S2`) instead of an exact row (`F1S2R5`). +- Multi-step plans run in `step` order: build `intermediate_table_stepN` first, then + read it in a later step. Don't reference a temp table before it is created. +- **Sample rows are a format hint only** — never assume they represent the full data + or the row count. Your SQL must scan the whole table. Use `LIKE '%value%'` for free + text and `=` for enums/codes. + +## Result handling & citations + +- `run-sql` output begins with citation instructions, then `file_ref_table`, then the + result CSV (with `__src`). +- Parse `__src` (`F1S2R5` = file_ref F1, sheet 2, row 5) and `file_ref_table` to build + ``. +- Put citations on their own line **after** the list/table that uses the data; combine + same-(file,sheet) rows into one citation. +- If the result hint says rows were truncated (`Only the first N rows ...; the + remaining M ...`), tell the user the total (`N+M`), shown (`N`), and omitted (`M`). 
+- Never expose the `__src` column itself to the user. + +### Controlling truncation + +`run-sql` truncates results by default (total rows and per-cell characters) to keep +the context manageable. If a result comes back truncated and you genuinely need more, +re-run with higher limits — do **not** re-run search-tables: + +```bash +python {SKILL_DIR}/scripts/table_query.py run-sql --max-rows 500 --cell-max 4000 <<'PLAN' +{"queries":[ ... ]} +PLAN +``` + +- `--max-rows`: max total rows across all `final` tables (default from backend config, + hard ceiling 2000). Prefer writing an aggregate query (SUM/COUNT/GROUP BY) over + pulling thousands of detail rows. +- `--cell-max`: max characters per cell before it is truncated with `..` (default from + backend config, hard ceiling 10000). Raise this when a long-text column (e.g. a + description/spec field) is getting cut off. diff --git a/skills/developing/table-query/scripts/table_query.py b/skills/developing/table-query/scripts/table_query.py new file mode 100755 index 0000000..b45a121 --- /dev/null +++ b/skills/developing/table-query/scripts/table_query.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +""" +table-query CLI. + +Fast, LLM-free table querying. Talks to the felo-mygpt table_query endpoints: + - search-tables : POST /v1/table_query/search_tables/{bot_id} + - get-schemas : POST /v1/table_query/get_schemas/{bot_id} + - run-sql : POST /v1/table_query/run_sql/{bot_id} + +The agent drives the orchestration (rewrite -> locate -> author SQL -> run); +the backend only does cheap work, so each call returns in seconds. +""" + +import argparse +import hashlib +import json +import os +import sys + +try: + import requests +except ImportError: + print("Error: requests module is required. 
Please install it with: pip install requests") + sys.exit(1) + +DEFAULT_BACKEND_HOST = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai") +DEFAULT_MASTERKEY = os.getenv("MASTERKEY", "master") + +# Same citation contract the legacy table_rag_retrieve used, so the agent's +# behaviour is unchanged. +TABLE_CITATION_INSTRUCTIONS = """ +When using the retrieved table knowledge below, you MUST add XML citation tags for factual claims. + +Format: `` +- Parse `__src`: `F1S2R5` = file_ref F1, sheet 2, row 5 +- Look up file_id in `file_ref_table` +- Combine same-sheet rows into one citation: `rows=[2, 4, 6]` +- MANDATORY: Create SEPARATE citation for EACH (file, sheet) combination +- NEVER put on the same line as a bullet point or table row +- Citations MUST be on separate lines AFTER the complete list/table +- NEVER include the `__src` column in your response - it is internal metadata only +- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge +- NEVER collect all citations and place them at the end of your response + +""" + + +def load_config() -> dict: + """Load robot_config.json from the robot project root (3 levels up from scripts/).""" + config_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'robot_config.json') + if os.path.exists(config_path): + try: + with open(config_path, 'r', encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + print(f"Warning: failed to load robot_config.json: {e}", file=sys.stderr) + return {} + + +def _resolve_bot_id(cli_bot_id: str) -> str: + if cli_bot_id: + return cli_bot_id + return load_config().get('bot_id') or os.getenv("BOT_ID") or os.getenv("ASSISTANT_ID") + + +def _post(path: str, bot_id: str, payload: dict) -> dict: + url = f"{DEFAULT_BACKEND_HOST}/v1/table_query/{path}/{bot_id}" + auth_token = hashlib.md5(f"{DEFAULT_MASTERKEY}:{bot_id}".encode()).hexdigest() + headers = { + "content-type": "application/json", + "authorization": 
f"Bearer {auth_token}", + } + trace_id = os.getenv("TRACE_ID") or os.getenv("X_REQUEST_ID") + if trace_id: + headers["X-Request-ID"] = trace_id + resp = requests.post(url, json=payload, headers=headers, timeout=30) + if resp.status_code != 200: + raise RuntimeError(f"API {path} returned {resp.status_code}: {resp.text}") + return resp.json() + + +def cmd_search_tables(args, bot_id: str) -> str: + res = _post("search_tables", bot_id, {"query": args.query, "top_k": args.top_k}) + tables = res.get("tables", []) + if not tables: + return ("No matching tables found. If the question may be answered from documents " + "instead of spreadsheets, fall back to the rag_retrieve document tool.") + lines = [f"Found {len(tables)} candidate table(s). Pick the relevant ones and call " + f"`get-schemas` for them next.\n"] + for t in tables: + lines.append( + f"- table_name: {t['table_name']}\n" + f" file: {t.get('file_name','')} | sheet: {t.get('sheet_name','')} " + f"| score: {round(t.get('score', 0), 3)}\n" + f" description: {t.get('table_description','')}" + ) + return "\n".join(lines) + + +def cmd_get_schemas(args, bot_id: str) -> str: + table_names = [t.strip() for t in args.tables.split(',') if t.strip()] + res = _post("get_schemas", bot_id, + {"table_names": table_names, "sample_rows": args.sample_rows}) + schemas = res.get("schemas", []) + missing = res.get("missing_tables", []) + if not schemas: + return f"No schemas resolved. 
Missing tables: {missing}" + blocks = [] + for s in schemas: + block = [f"### Table: {s['table_name']}", + f"File: {s.get('file_name','')} | Sheet: {s.get('sheet_name','')}", + "```sql", s.get('sql_create', ''), "```"] + sample = s.get('sample_rows') or [] + if sample: + block.append("Sample rows (format hint only, NOT the row count):") + block.append("```csv") + for row in sample: + block.append(",".join('"' + str(c).replace('"', '""') + '"' for c in row)) + block.append("```") + blocks.append("\n".join(block)) + out = "\n\n".join(blocks) + if missing: + out += f"\n\nNote: these requested tables were not found: {missing}" + out += ("\n\nNow author a SQLite plan and run it by piping the JSON to run-sql on stdin:\n" + " run-sql <<'PLAN'\n" + " {\"queries\": [{\"step\": 1, \"sql\": \"CREATE TEMP TABLE \\\"final_table_step1\\\" " + "AS SELECT ...\", \"source_table_names\": [\"...\"], " + "\"destine_table_name\": \"final_table_step1\", \"destine_table_type\": \"final\"}]}\n" + " PLAN\n" + "Quote all identifiers with double quotes.") + return out + + +def cmd_run_sql(args, bot_id: str) -> str: + # Read the plan from --plan-file if given, otherwise from stdin (heredoc). + try: + if args.plan_file: + with open(args.plan_file, 'r', encoding='utf-8') as f: + raw = f.read() + else: + raw = sys.stdin.read() + if not raw.strip(): + return ("Error: no plan provided. Pipe the JSON plan via stdin, e.g.\n" + " python scripts/table_query.py run-sql <<'PLAN'\n" + " {\"queries\": [...]}\n" + " PLAN") + plan = json.loads(raw) + except (json.JSONDecodeError, IOError) as e: + return f"Error: failed to read SQL plan: {e}" + # accept either {"queries": [...]} or a bare [...] list + queries = plan.get("queries") if isinstance(plan, dict) else plan + if not queries: + return "Error: the plan must contain a non-empty `queries` list." 
+ payload = {"queries": queries} + if args.max_rows is not None: + payload["max_rows"] = args.max_rows + if args.cell_max is not None: + payload["cell_max"] = args.cell_max + res = _post("run_sql", bot_id, payload) + if not res.get("success"): + return (f"SQL execution failed: {res.get('error')}\n" + "Fix your SQL and call run-sql again. Do NOT restart from search-tables.") + parts = [TABLE_CITATION_INSTRUCTIONS] + if res.get("instruction"): + parts.append(res["instruction"]) + if res.get("knowledge"): + parts.append(res["knowledge"]) + if res.get("extra_goal"): + parts.append(res["extra_goal"]) + return "\n".join(parts) + + +def main(): + parser = argparse.ArgumentParser(description="table-query: fast LLM-free table querying") + parser.add_argument("--bot-id", default=None, help="Bot id (defaults to robot_config.json)") + sub = parser.add_subparsers(dest="command", required=True) + + p_search = sub.add_parser("search-tables", help="Vector-locate relevant tables") + p_search.add_argument("--query", "-q", required=True, help="Rewritten retrieval query") + p_search.add_argument("--top-k", "-k", type=int, default=20) + + p_schemas = sub.add_parser("get-schemas", help="Fetch CREATE TABLE schema + sample rows") + p_schemas.add_argument("--tables", "-t", required=True, help="Comma-separated table names") + p_schemas.add_argument("--sample-rows", type=int, default=3) + + p_run = sub.add_parser("run-sql", help="Execute an authored SQL plan (JSON via stdin or file)") + p_run.add_argument("--plan-file", "-f", default=None, + help="Path to plan JSON file (optional; defaults to reading stdin)") + p_run.add_argument("--max-rows", type=int, default=None, + help="Max total result rows (raise if a result came back truncated)") + p_run.add_argument("--cell-max", type=int, default=None, + help="Max characters per cell before truncation") + + args = parser.parse_args() + bot_id = _resolve_bot_id(args.bot_id) + if not bot_id: + print("Error: bot_id is required (robot_config.json / 
--bot-id / BOT_ID env)") + sys.exit(1) + + try: + if args.command == "search-tables": + print(cmd_search_tables(args, bot_id)) + elif args.command == "get-schemas": + print(cmd_get_schemas(args, bot_id)) + elif args.command == "run-sql": + print(cmd_run_sql(args, bot_id)) + except Exception as e: + print(f"Error: {str(e)}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/developing/table-query/skill.yaml b/skills/developing/table-query/skill.yaml new file mode 100644 index 0000000..839dda9 --- /dev/null +++ b/skills/developing/table-query/skill.yaml @@ -0,0 +1,25 @@ +name: table-query +version: 1.0.0 +description: Fast LLM-free table querying. Locate tables, fetch schema, author SQLite SQL, and run it with row-level citations. +author: + name: sparticle + email: support@gbase.ai +license: MIT +tags: + - table + - sql + - excel + - retrieval + - citation +runtime: + python: ">=3.7" + dependencies: + - requests +entry_point: scripts/table_query.py +commands: + search-tables: + description: Vector-locate relevant tables for a query + get-schemas: + description: Fetch CREATE TABLE schema + sample rows for given tables + run-sql: + description: Execute an authored SQLite plan and return CSV with __src citations diff --git a/skills/developing/table-query/verify_table_query.sh b/skills/developing/table-query/verify_table_query.sh new file mode 100755 index 0000000..f6de962 --- /dev/null +++ b/skills/developing/table-query/verify_table_query.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# +# Manual verification for the new table_query endpoints. +# Run this against an environment where the feature/table-query-split branch is +# deployed (e.g. dev). It checks the 3 fast endpoints and diffs run_sql output +# against the legacy table_rag_retrieve for parity. 
+#
+# Usage:
+#   HOST=https://api-dev.gptbase.ai BOT_ID=<bot-id> MASTERKEY=master ./verify_table_query.sh
+# QUERY may also be overridden; it is JSON-encoded before sending, so quotes in it are safe.
+set -euo pipefail
+
+HOST="${HOST:-https://api-dev.gptbase.ai}"
+# bot from the slow-request log (has the 案1_売上明細 xlsx). Override as needed.
+BOT_ID="${BOT_ID:-c1fa021b-6c41-41d5-b1e6-adfb8896aaaa}"
+MASTERKEY="${MASTERKEY:-master}"
+QUERY="${QUERY:-2025年4月〜6月の売上実績}"
+
+# Auth token is the hex MD5 digest of "<masterkey>:<bot_id>", sent below as a Bearer token.
+TOKEN=$(python3 -c "import hashlib,sys;print(hashlib.md5(f'{sys.argv[1]}:{sys.argv[2]}'.encode()).hexdigest())" "$MASTERKEY" "$BOT_ID")
+AUTH="authorization: Bearer ${TOKEN}"
+CT="content-type: application/json"
+
+echo "=== HOST=$HOST BOT_ID=$BOT_ID ==="
+
+echo
+echo "### 1) search_tables ###"
+curl -sS --request POST "$HOST/v1/table_query/search_tables/$BOT_ID" \
+  --header "$AUTH" --header "$CT" \
+  --data "$(python3 -c 'import json,sys;print(json.dumps({"query": sys.argv[1], "top_k": 20}))' "$QUERY")" | python3 -m json.tool
+
+echo
+echo "### 2) get_schemas (EDIT --data table_names with names from step 1) ###"
+echo "curl -s --request POST \"$HOST/v1/table_query/get_schemas/$BOT_ID\" \\"
+echo " --header \"$AUTH\" --header \"$CT\" \\"
+echo " --data '{\"table_names\": [\"<TABLE_NAME>\"], \"sample_rows\": 3}' | python3 -m json.tool"
+
+echo
+echo "### 3) run_sql (EDIT the sql to match the real table/columns from step 2) ###"
+cat > /tmp/tq_plan.json <<'JSON'
+{
+  "queries": [
+    {
+      "step": 1,
+      "sql": "CREATE TEMP TABLE \"final_table_step1\" AS SELECT \"計上日\", \"得意先名\", \"売上金額\" FROM \"<TABLE_NAME>\" LIMIT 10",
+      "source_table_names": ["<TABLE_NAME>"],
+      "destine_table_name": "final_table_step1",
+      "destine_table_type": "final",
+      "destine_table_description": "sample rows"
+    }
+  ]
+}
+JSON
+echo "Edit /tmp/tq_plan.json (replace <TABLE_NAME>), then:"
+echo "curl -s --request POST \"$HOST/v1/table_query/run_sql/$BOT_ID\" \\"
+echo " --header \"$AUTH\" --header \"$CT\" \\"
+echo " --data @/tmp/tq_plan.json | python3 -m json.tool"
+echo
+echo "ASSERT: run_sql output 'knowledge' contains a '__src' column and 'file_ref_table'."
+
+# Step 4 only prints the legacy curl command (it is not executed) for a manual comparison with #3.
+printf '%s\n' "" \
+  "### 4) legacy table_rag_retrieve (parity reference, same question) ###" \
+  "curl -s --request POST \"$HOST/v1/table_rag_retrieve/$BOT_ID\" \\" \
+  " --header \"$AUTH\" --header \"$CT\" \\" \
+  " --data '{\"query\": \"$QUERY\"}' | python3 -m json.tool" \
+  "" "Compare the __src tokens / result rows between #3 and #4 for the same SQL intent."