qwen_agent/skills/developing/mineru/scripts/sinks/_http.py
2026-06-05 14:35:17 +08:00

73 lines
2.6 KiB
Python

"""Zero-dependency HTTP helpers shared by all sinks (stdlib urllib only).
``http_request`` is the single seam tests monkeypatch.
"""
from __future__ import annotations
import json
import mimetypes
import urllib.error
import urllib.request
from typing import Optional
# Value of the User-Agent header that http_request adds to each request.
USER_AGENT = "MinerU-Skill-sink/1.0"
def http_request(method, url, *, headers=None, data=None, timeout=60):
    """Send a single HTTP request and hand back ``(status_code, body_bytes)``.

    A ``urllib.error.HTTPError`` is not re-raised: its status code is returned
    together with whatever body can be read from it (``b""`` when the error
    object exposes no ``read``). Any other exception propagates to the caller.
    """
    request = urllib.request.Request(
        url,
        data=data,
        method=method,
        headers=headers or {},
    )
    # Stamp every outgoing request with this module's User-Agent.
    request.add_header("User-Agent", USER_AGENT)

    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            status = response.getcode()
            payload = response.read()
    except urllib.error.HTTPError as http_error:
        # Error responses still carry a status and (usually) a body; surface
        # them as a normal return value instead of an exception.
        if hasattr(http_error, "read"):
            error_body = http_error.read()
        else:
            error_body = b""
        return http_error.code, error_body
    return status, payload
def request_json(method, url, *, headers=None, payload=None, timeout=60):
    """Issue a request with an optional JSON body and decode a JSON reply.

    When ``payload`` is not ``None`` it is serialised as UTF-8 JSON and a
    ``Content-Type: application/json`` header is added unless the caller
    already supplied one. The caller's ``headers`` mapping is copied, never
    mutated.

    Returns ``(status_code, parsed)`` where ``parsed`` is the decoded JSON
    value, or an empty dict when the response body is empty or cannot be
    decoded as UTF-8 JSON.
    """
    request_headers = dict(headers) if headers else {}

    if payload is None:
        encoded = None
    else:
        request_headers.setdefault("Content-Type", "application/json")
        encoded = json.dumps(payload, ensure_ascii=False).encode("utf-8")

    status, raw = http_request(
        method, url, headers=request_headers, data=encoded, timeout=timeout
    )

    if not raw:
        return status, {}
    try:
        return status, json.loads(raw.decode("utf-8"))
    except (ValueError, UnicodeDecodeError):
        # Non-JSON or non-UTF-8 bodies are treated the same as an empty reply.
        return status, {}
def _escape_disposition_value(value):
    """Escape *value* for use inside a quoted Content-Disposition parameter.

    A double quote would terminate the parameter early and a CR/LF would start
    a new header line, so those three characters are percent-escaped the way
    browsers do for ``multipart/form-data`` (``%22``, ``%0D``, ``%0A``).
    """
    return (
        str(value)
        .replace('"', "%22")
        .replace("\r", "%0D")
        .replace("\n", "%0A")
    )


def encode_multipart(fields=None, files=None):
    """Build a ``multipart/form-data`` body with stdlib only.

    ``fields``: dict of str -> str; values are converted with ``str()`` and
    encoded as UTF-8.
    ``files``: list of ``(field_name, filename, bytes)``; the part's
    ``Content-Type`` is guessed from ``filename`` and falls back to
    ``application/octet-stream``.

    Field names and filenames are escaped before being placed in the
    ``Content-Disposition`` header so that quotes or line breaks in them cannot
    break the header or inject additional ones.

    Returns ``(content_type, body_bytes)``.
    """
    boundary = "----MinerUSinkBoundary7MA4YWxkTrZu0gW"
    delimiter = b"--" + boundary.encode()
    parts = []

    for name, value in (fields or {}).items():
        safe_name = _escape_disposition_value(name)
        parts.append(delimiter)
        parts.append(f'Content-Disposition: form-data; name="{safe_name}"'.encode())
        parts.append(b"")  # blank line separating part headers from content
        parts.append(str(value).encode("utf-8"))

    for field_name, filename, content in files or []:
        # Guess from the real filename, not the escaped one.
        ctype = mimetypes.guess_type(filename)[0] or "application/octet-stream"
        safe_name = _escape_disposition_value(field_name)
        safe_filename = _escape_disposition_value(filename)
        parts.append(delimiter)
        parts.append(
            f'Content-Disposition: form-data; name="{safe_name}"; '
            f'filename="{safe_filename}"'.encode()
        )
        parts.append(f"Content-Type: {ctype}".encode())
        parts.append(b"")
        parts.append(content)

    parts.append(delimiter + b"--")  # closing delimiter
    parts.append(b"")  # yields the trailing CRLF after the closing delimiter
    body = b"\r\n".join(parts)
    return f"multipart/form-data; boundary={boundary}", body