"""WPS / 金山文档 (Kingsoft kdocs) sink — optional dependency.

The native ingestion path is: Markdown → ``.docx`` → upload to the kdocs cloud
appspace. There is no official Python SDK, so:

* Markdown→DOCX uses the maintained, pure-pip ``html-for-docx`` package
  (reusing this project's Markdown→HTML), lazily imported so the core stays
  zero-dependency. Install with ``pip install mineru-skill[wps]``.
* The kdocs WPS-2 request signing (plain SHA-1) and multipart upload are done
  with the standard library — small and fully documented.

Cloud upload requires an approved kdocs developer app (``WPS_APP_ID`` /
``WPS_APP_SECRET``) and a provisioned appspace; it is opt-in and surfaces the
raw kdocs error on failure. Docs: https://developer.kdocs.cn/server/guide/signature.html
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import email.utils
|
|
import hashlib
|
|
import io
|
|
import json
|
|
|
|
from . import _http, _md
|
|
from .base import ParsedDoc, Sink, SinkError, SinkResult, register
|
|
|
|
# kdocs OpenAPI endpoint that WpsSink.deliver POSTs the multipart DOCX upload to.
KDOCS_UPLOAD = "https://developer.kdocs.cn/api/v1/openapi/appspace/files/upload"
|
|
|
|
|
|
def _markdown_to_docx_bytes(markdown: str) -> bytes:
    """Convert Markdown → HTML → DOCX bytes via the optional html-for-docx lib.

    The Markdown is rendered to HTML with this package's ``_md.md_to_html``;
    the HTML is then parsed into a Word document by ``html4docx.HtmlToDocx``
    and serialized into an in-memory buffer.

    Args:
        markdown: Markdown source text to convert.

    Returns:
        The bytes of the generated ``.docx`` file.

    Raises:
        SinkError: If the optional ``html4docx`` module cannot be imported.
    """
    # Imported inside the function so the optional dependency is only required
    # when a DOCX conversion is actually requested.
    try:
        from html4docx import HtmlToDocx  # pip install html-for-docx
    except ImportError as exc:  # pragma: no cover - exercised via SinkError path
        raise SinkError(
            "WPS sink needs a Markdown→DOCX converter — "
            "pip install 'mineru-skill[wps]' (i.e. pip install html-for-docx)"
        ) from exc

    html = _md.md_to_html(markdown)
    document = HtmlToDocx().parse_html_string(html)

    # Save into memory rather than to disk; the caller only needs the bytes.
    buf = io.BytesIO()
    document.save(buf)
    return buf.getvalue()
|
|
|
|
|
|
def _wps2_headers(
    app_id: str, app_secret: str, body: bytes, content_type: str
) -> dict[str, str]:
    """Build kdocs WPS-2 auth headers.

    signature = sha1(app_secret + content_md5 + content_type + date) hex.
    Content-Md5 / Content-Type must match the exact wire body and header sent.

    Args:
        app_id: Application id placed in the ``Authorization`` header.
        app_secret: Application secret; used only as signature input and never
            emitted in a header.
        body: Exact request body bytes; its MD5 hex digest becomes ``Content-Md5``.
        content_type: Exact ``Content-Type`` header value that will be sent.

    Returns:
        A dict with the ``Date``, ``Content-Md5``, ``Content-Type`` and
        ``Authorization`` headers, in that order.
    """
    content_md5 = hashlib.md5(body).hexdigest()
    date = email.utils.formatdate(usegmt=True)  # RFC1123 GMT
    # The same date string is both signed and sent, so the two always agree.
    signature = hashlib.sha1(
        (app_secret + content_md5 + content_type + date).encode("utf-8")
    ).hexdigest()
    return {
        "Date": date,
        "Content-Md5": content_md5,
        "Content-Type": content_type,
        "Authorization": f"WPS-2:{app_id}:{signature}",
    }
|
|
|
|
|
|
@register
class WpsSink(Sink):
    """Sink that converts a parsed document to DOCX and uploads it to kdocs.

    Configuration is read through ``self.env``: ``WPS_APP_ID`` and
    ``WPS_APP_SECRET`` sign the request, while ``WPS_PARENT_PATH`` and
    ``WPS_PARENT_TOKEN`` are optional and only forwarded when set.
    """

    name = "wps"
    aliases = ("kdocs", "金山文档", "金山")
    requires = ("WPS_APP_ID", "WPS_APP_SECRET")
    label = "WPS / 金山文档 (Markdown→DOCX upload, optional dep)"

    def deliver(self, doc: ParsedDoc) -> SinkResult:
        """Upload ``doc`` to the kdocs appspace as a ``.docx`` file.

        Args:
            doc: Parsed document; its ``markdown`` is converted to DOCX and its
                ``title`` is used to derive the uploaded file name.

        Returns:
            A successful ``SinkResult`` whose ``url`` is the ``file_token``
            from the response's ``data`` object (``None`` if it is absent).

        Raises:
            SinkError: If the DOCX converter is unavailable, the HTTP status is
                400 or above, or the response carries a non-zero ``code``.
        """
        app_id = self.env("WPS_APP_ID")
        app_secret = self.env("WPS_APP_SECRET")

        docx_bytes = _markdown_to_docx_bytes(doc.markdown)
        filename = _md.safe_filename(doc.title) + ".docx"

        # Optional placement hints; only sent when configured.
        fields = {}
        parent_path = self.env("WPS_PARENT_PATH")
        parent_token = self.env("WPS_PARENT_TOKEN")
        if parent_path:
            fields["parent_path"] = parent_path
        if parent_token:
            fields["parent_token"] = parent_token

        content_type, body = _http.encode_multipart(
            fields=fields, files=[("file", filename, docx_bytes)]
        )
        # Sign the exact body and Content-Type that go on the wire.
        headers = _wps2_headers(app_id, app_secret, body, content_type)

        status, raw = _http.http_request("POST", KDOCS_UPLOAD, headers=headers, data=body)
        try:
            parsed = json.loads(raw.decode("utf-8")) if raw else {}
        except (ValueError, UnicodeDecodeError):
            parsed = {}
        # json.loads may yield a list/str/number; treat anything that is not a
        # JSON object like an unparsable body so the lookups below cannot raise
        # AttributeError.
        if not isinstance(parsed, dict):
            parsed = {}

        if status >= 400 or parsed.get("code") not in (0, None):
            raise SinkError(parsed.get("message") or parsed.get("msg") or f"kdocs HTTP {status}")

        # "data" may be missing, null, or not an object; only read the token
        # from a real mapping.
        data = parsed.get("data")
        file_token = data.get("file_token") if isinstance(data, dict) else None
        return SinkResult(
            sink=self.name,
            ok=True,
            url=file_token,
            detail="Markdown→DOCX uploaded to 金山文档 (experimental; needs a provisioned appspace)",
        )
|