# qwen_agent/skills/developing/pmda-drug-info/taxonomy.py
# 2026-06-12 11:03:30 +08:00
#
# 65 lines
# 1.9 KiB
# Python

"""Drug-category taxonomy loader.
Reads `pmda/drug_category.md` (the cleaned-up nested list with codes) and
produces a `{l2_code: L2}` dict for joining with `DocMeta.l2_code` (the first
3 characters of the YJ code).

Source markdown shape (L1 rows start at column 0; L2 rows are nested under
their L1 row with a 2-space indent):

- 11 中枢神経系用薬
  - 111 全身麻酔剤
  - 112 催眠鎮静剤,抗不安剤
- 12 末梢神経用薬
  - 121 局所麻酔剤
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from pathlib import Path
# Top-level (L1) row: `- {2-digit code} {name}`, starting at column 0.
# Group 1 captures the code, group 2 the rest of the line (the name).
_L1_RE = re.compile(r"^- (\d{2})\s+(.+)$")
# Nested (L2) row: `  - {3-digit code} {name}`, indented by exactly 2 spaces.
# Group 1 captures the code, group 2 the rest of the line (the name).
_L2_RE = re.compile(r"^ {2}- (\d{3})\s+(.+)$")
@dataclass(frozen=True)
class L2:
    """Immutable record for one level-2 drug category and its level-1 parent."""

    # Level-2 category code, e.g. "111"
    code: str
    # Level-2 category name, e.g. "全身麻酔剤"
    name: str
    # Code of the parent level-1 category, e.g. "11"
    l1_code: str
    # Name of the parent level-1 category, e.g. "中枢神経系用薬"
    l1_name: str
def load_taxonomy(path: Path | str = "pmda/drug_category.md") -> dict[str, L2]:
    """Parse the drug-category markdown file into a ``{l2_code: L2}`` mapping.

    Lines matching ``_L1_RE`` set the current level-1 category. Lines matching
    ``_L2_RE`` become entries keyed by their code and tagged with the current
    level-1 code and name. Every other line is ignored. If the same L2 code
    occurs more than once, the last occurrence wins.

    Args:
        path: Location of the markdown file. The default is a relative path,
            so it is resolved against the current working directory.

    Returns:
        Mapping from L2 code to its ``L2`` record.

    Raises:
        ValueError: If an L2 row is encountered before any L1 row.
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM. With
    # plain "utf-8" the BOM stays glued to the first line, so the first L1 row
    # fails to match `^- ` and the next L2 row raises a misleading ValueError.
    text = Path(path).read_text(encoding="utf-8-sig")

    out: dict[str, L2] = {}
    current_l1_code = ""
    current_l1_name = ""
    for line in text.splitlines():
        m1 = _L1_RE.match(line)
        if m1:
            # A new L1 row becomes the parent of all L2 rows that follow it.
            current_l1_code, current_l1_name = m1.group(1), m1.group(2).strip()
            continue
        m2 = _L2_RE.match(line)
        if m2:
            code = m2.group(1)
            name = m2.group(2).strip()
            if not current_l1_code:
                raise ValueError(f"L2 row {code} appears before any L1 in {path}")
            out[code] = L2(
                code=code,
                name=name,
                l1_code=current_l1_code,
                l1_name=current_l1_name,
            )
    return out
def lookup(taxonomy: dict[str, L2], l2_code: str) -> L2 | None:
    """Fetch the entry stored under ``l2_code`` in ``taxonomy``.

    Gives back ``None`` when the code (the YJ prefix) has no entry in the
    taxonomy.
    """
    entry = taxonomy.get(l2_code)
    return entry
if __name__ == "__main__":
    # Smoke test: load the default taxonomy file and probe a few sample codes.
    t = load_taxonomy()
    print(f"Loaded {len(t)} L2 categories")
    for code in ("111", "214", "421", "999"):
        # Use the module's own helper so the demo follows the same path as callers.
        v = lookup(t, code)
        # Separate code and value; without it the output runs together
        # (e.g. "999None"), which is hard to read.
        print(f" {code}: {v}")