"""Drug-category taxonomy loader. Reads `pmda/drug_category.md` (the cleaned-up nested list with codes) and produces a `{l2_code: L2}` dict for joining with `DocMeta.l2_code` (first 3 chars of the YJ code). Source markdown shape: - 11 中枢神経系用薬 - 111 全身麻酔剤 - 112 催眠鎮静剤,抗不安剤 - 12 末梢神経用薬 - 121 局所麻酔剤 """ from __future__ import annotations import re from dataclasses import dataclass from pathlib import Path # Top-level: `- {2-digit} {name}` _L1_RE = re.compile(r"^- (\d{2})\s+(.+)$") # Nested: ` - {3-digit} {name}` (indent 2 spaces) _L2_RE = re.compile(r"^ {2}- (\d{3})\s+(.+)$") @dataclass(frozen=True) class L2: code: str # "111" name: str # "全身麻酔剤" l1_code: str # "11" l1_name: str # "中枢神経系用薬" def load_taxonomy(path: Path | str = "pmda/drug_category.md") -> dict[str, L2]: out: dict[str, L2] = {} current_l1_code = "" current_l1_name = "" for line in Path(path).read_text(encoding="utf-8").splitlines(): m1 = _L1_RE.match(line) if m1: current_l1_code, current_l1_name = m1.group(1), m1.group(2).strip() continue m2 = _L2_RE.match(line) if m2: code = m2.group(1) name = m2.group(2).strip() if not current_l1_code: raise ValueError(f"L2 row {code} appears before any L1 in {path}") out[code] = L2(code=code, name=name, l1_code=current_l1_code, l1_name=current_l1_name) return out def lookup(taxonomy: dict[str, L2], l2_code: str) -> L2 | None: """Return the L2 entry, or None if the YJ prefix isn't in the taxonomy.""" return taxonomy.get(l2_code) if __name__ == "__main__": t = load_taxonomy() print(f"Loaded {len(t)} L2 categories") for code in ("111", "214", "421", "999"): v = t.get(code) print(f" {code} → {v}")