"""Small, dependency-free Markdown utilities used by sinks. These are intentionally pragmatic, not a full CommonMark implementation: they cover the constructs MinerU emits (headings, emphasis, code, lists, tables, blockquotes, links, images) well enough to deliver faithful content to tools that require HTML (Confluence, OneNote) or an outline (Logseq). """ from __future__ import annotations import html import re from pathlib import Path from typing import Optional _IMAGE_RE = re.compile(r"!\[(?P[^\]]*)\]\((?P[^)\s]+)(?:\s+\"[^\"]*\")?\)") _ILLEGAL_FS = re.compile(r'[\\/:*?"<>|#^\[\]]+') def slugify(text: str, default: str = "document") -> str: """Filesystem/URL-safe slug.""" text = text.strip().lower() text = re.sub(r"[\s_]+", "-", text) text = re.sub(r"[^a-z0-9\-]+", "", text) text = re.sub(r"-{2,}", "-", text).strip("-") return text or default def safe_filename(title: str, default: str = "document") -> str: """Clean a title into a safe note filename (keeps unicode, drops illegal chars).""" name = _ILLEGAL_FS.sub(" ", title).strip() name = re.sub(r"\s{2,}", " ", name) return name[:120] or default def is_remote(ref: str) -> bool: return ref.startswith("http://") or ref.startswith("https://") or ref.startswith("data:") def find_local_images(markdown: str, base_dir) -> list: """Return ``[(alt, ref, Path)]`` for image refs that point at existing local files.""" base = Path(base_dir) if base_dir else None found = [] seen = set() for match in _IMAGE_RE.finditer(markdown): ref = match.group("ref") if is_remote(ref) or ref in seen: continue path = Path(ref) if not path.is_absolute() and base is not None: path = base / ref if path.is_file(): found.append((match.group("alt"), ref, path)) seen.add(ref) return found def rewrite_images(markdown: str, mapping: dict) -> str: """Rewrite local image refs using ``{old_ref: new_ref}``.""" def repl(match): ref = match.group("ref") if ref in mapping: return f"![{match.group('alt')}]({mapping[ref]})" return match.group(0) return _IMAGE_RE.sub(repl, markdown) def yaml_frontmatter(props: dict) -> str: """Render a YAML frontmatter block. List values become ``- item`` lines.""" lines = ["---"] for key, value in props.items(): if value is None or value == "" or value == []: continue if isinstance(value, (list, tuple)): lines.append(f"{key}:") for item in value: lines.append(f" - {item}") else: lines.append(f"{key}: {value}") lines.append("---") return "\n".join(lines) # --------------------------------------------------------------------------- # # Inline + block Markdown -> HTML (pragmatic, XHTML-safe) # --------------------------------------------------------------------------- # def _inline(text: str) -> str: """Convert inline Markdown to HTML on already-escaped text.""" # images first, then links text = _IMAGE_RE.sub( lambda m: f'{m.group(', text, ) text = re.sub(r"\[([^\]]+)\]\(([^)\s]+)\)", lambda m: f'{m.group(1)}', text) text = re.sub(r"`([^`]+)`", r"\1", text) text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) text = re.sub(r"(?\1", text) return text def md_to_html(markdown: str) -> str: """Convert a Markdown document to a pragmatic, XHTML-safe HTML fragment.""" out = [] lines = markdown.replace("\r\n", "\n").split("\n") i = 0 n = len(lines) in_code = False code_buf: list = [] list_stack: list = [] # 'ul' / 'ol' def close_lists(): while list_stack: out.append(f"") while i < n: line = lines[i] fence = line.strip().startswith("```") if fence and not in_code: close_lists() in_code = True code_buf = [] i += 1 continue if fence and in_code: out.append("
" + html.escape("\n".join(code_buf)) + "
") in_code = False i += 1 continue if in_code: code_buf.append(line) i += 1 continue stripped = line.strip() if not stripped: close_lists() i += 1 continue # table block if "|" in stripped and i + 1 < n and re.match(r"^\s*\|?[\s:|-]+\|?\s*$", lines[i + 1]): close_lists() header = [c.strip() for c in stripped.strip("|").split("|")] rows = [] i += 2 while i < n and "|" in lines[i] and lines[i].strip(): rows.append([c.strip() for c in lines[i].strip().strip("|").split("|")]) i += 1 out.append("" + "".join(f"" for c in header) + "") for row in rows: out.append("" + "".join(f"" for c in row) + "") out.append("
{_inline(html.escape(c))}
{_inline(html.escape(c))}
") continue heading = re.match(r"^(#{1,6})\s+(.*)$", stripped) if heading: close_lists() level = len(heading.group(1)) out.append(f"{_inline(html.escape(heading.group(2)))}") i += 1 continue if stripped.startswith(">"): close_lists() out.append(f"
{_inline(html.escape(stripped[1:].strip()))}
") i += 1 continue if re.match(r"^([-*+])\s+", stripped): if not list_stack or list_stack[-1] != "ul": close_lists() list_stack.append("ul") out.append("