qwen_agent/skills/developing/pmda-drug-info/taxonomy.py

"""Drug-category taxonomy loader.

Reads `pmda/drug_category.md` (the cleaned-up nested list with codes) and
produces a `{l2_code: L2}` dict for joining with `DocMeta.l2_code` (first 3
chars of the YJ code).

Source markdown shape:

    - 11 中枢神経系用薬
      - 111 全身麻酔剤
      - 112 催眠鎮静剤，抗不安剤
    - 12 末梢神経用薬
      - 121 局所麻酔剤
"""
from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path

# Top-level (L1) row: `- {2-digit code} {name}`, with the dash at column 0.
# Group 1 captures the code; group 2 captures the rest of the line (the name).
_L1_RE = re.compile(r"^- (\d{2})\s+(.+)$")
# Nested (L2) row: `  - {3-digit code} {name}`, indented by exactly two spaces.
# Group 1 captures the code; group 2 captures the rest of the line (the name).
_L2_RE = re.compile(r"^ {2}- (\d{3})\s+(.+)$")


@dataclass(frozen=True)
class L2:
    """A second-level (3-digit) drug category together with its L1 parent.

    Declared frozen, so instances are immutable and hashable.
    """

    # Three-digit category code, e.g. "111".
    code: str
    # Category name, e.g. "全身麻酔剤" (general anesthetics).
    name: str
    # Two-digit code of the parent top-level category, e.g. "11".
    l1_code: str
    # Name of the parent top-level category, e.g. "中枢神経系用薬"
    # (central nervous system agents).
    l1_name: str


def load_taxonomy(path: Path | str = "pmda/drug_category.md") -> dict[str, L2]:
    """Parse the drug-category markdown list into a ``{l2_code: L2}`` mapping.

    The file is scanned line by line. A line matching ``_L1_RE`` sets the
    current top-level category; every following line matching ``_L2_RE`` is
    recorded under that category. Lines matching neither pattern are ignored.
    If the same L2 code appears more than once, the last occurrence wins.

    Args:
        path: Location of the markdown file.

    Returns:
        A dict keyed by the three-digit L2 code, in file order.

    Raises:
        ValueError: If an L2 row appears before any L1 row.
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    out: dict[str, L2] = {}
    current_l1_code = ""
    current_l1_name = ""
    # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but also strips a
    # leading byte-order mark. With plain "utf-8" the BOM stays attached to the
    # first line, so a first-line L1 row would fail to match and the next L2
    # row would be rejected as orphaned.
    text = Path(path).read_text(encoding="utf-8-sig")
    for line in text.splitlines():
        m1 = _L1_RE.match(line)
        if m1:
            current_l1_code, current_l1_name = m1.group(1), m1.group(2).strip()
            continue
        m2 = _L2_RE.match(line)
        if m2:
            code = m2.group(1)
            name = m2.group(2).strip()
            if not current_l1_code:
                raise ValueError(f"L2 row {code} appears before any L1 in {path}")
            out[code] = L2(code=code, name=name, l1_code=current_l1_code, l1_name=current_l1_name)
    return out


def lookup(taxonomy: dict[str, L2], l2_code: str) -> L2 | None:
    """Fetch the category stored under ``l2_code`` in ``taxonomy``.

    Gives back ``None`` when the code (the YJ prefix) has no entry.
    """
    entry = taxonomy.get(l2_code, None)
    return entry


if __name__ == "__main__":
    # Manual smoke check: load the default file, report how many L2
    # categories were parsed, then show what a few sample codes resolve to.
    taxonomy = load_taxonomy()
    print(f"Loaded {len(taxonomy)} L2 categories")
    sample_codes = ("111", "214", "421", "999")
    for sample in sample_codes:
        entry = taxonomy.get(sample)
        print(f"  {sample} → {entry}")