# qwen_agent/skills/developing/pmda-drug-info/os_client.py

"""OpenSearch `pmda_sections` index spec + client helper.

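The mapping shares its Sudachi plugin configuration with wiki-skill (same
OpenSearch cluster, same Sudachi core dictionary). Each document corresponds
to one section node of a single drug package insert and redundantly stores
the drug metadata to avoid JOINs (see design.md §2.1.2).

Environment variables:
  PMDA_OPENSEARCH_URL / OPENSEARCH_URL / OPENSEARCH_HOST
                (first non-empty value wins; default http://localhost:9200,
                 consistent with wiki-skill `_common.py`)
  PMDA_OS_INDEX (default pmda_sections)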
"""
from __future__ import annotations

import os

from opensearchpy import OpenSearch

# Plugin env vars: the OpenSearch endpoint is the first non-empty value among
# PMDA_OPENSEARCH_URL (recommended), OPENSEARCH_URL and OPENSEARCH_HOST;
# when none is set it falls back to http://localhost:9200.
OS_HOST = (
    os.environ.get("PMDA_OPENSEARCH_URL")
    or os.environ.get("OPENSEARCH_URL")
    or os.environ.get("OPENSEARCH_HOST")
    or "http://localhost:9200"
)
# Index name. An empty PMDA_OS_INDEX is treated as unset (same rule as the
# host variables above) so an empty index name is never produced.
INDEX_NAME = os.environ.get("PMDA_OS_INDEX") or "pmda_sections"


# ---- Mapping spec --------------------------------------------------------

# Tokenizer definition registered under the name "sudachi_tokenizer".
_SUDACHI_TOKENIZER: dict = {
    "type": "sudachi_tokenizer",
    "split_mode": "C",
    "discard_punctuation": True,
}

# Token filters referenced by the "jp_med" analyzer.
#
# A "med_synonyms" synonym filter is currently disabled. Its intended
# configuration is kept here for reference: an initial minimal set, to be
# extended after observing missed hits. Within one rule, commas separate
# equivalent terms and spaces count as in-word characters.
#
#   "med_synonyms": {
#       "type": "synonym",
#       "synonyms": [
#           "Stevens-Johnson, 皮膚粘膜眼症候群, SJS",
#           "中毒性表皮壊死融解症, TEN, ライエル症候群",
#           "QT延長, トルサード, Torsades de pointes",
#           "間質性肺炎, 肺臓炎",
#           "横紋筋融解症, ラブドミオリーシス",
#           "アナフィラキシー, アナフィラキシーショック",
#           "無顆粒球症, 顆粒球減少症",
#       ],
#   },
_TOKEN_FILTERS: dict = {
    "jp_pos": {"type": "sudachi_part_of_speech"},
    "jp_stop": {"type": "sudachi_ja_stop"},
}

# Custom analyzer applied to the analyzed text fields.
# icu_normalizer is not included in the default image; sudachi_normalizedform
# covers full-width/half-width handling and normalization instead.
_JP_MED_ANALYZER: dict = {
    "type": "custom",
    "tokenizer": "sudachi_tokenizer",
    "filter": [
        "sudachi_baseform",
        "sudachi_normalizedform",
        "jp_pos",
        "jp_stop",
        "lowercase",
    ],
}

# Fields mapped as plain keywords, in mapping order.
_KEYWORD_FIELDS: tuple = (
    "yj_full",
    "yj_code",
    "l1_code",
    "l2_code",
    "l2_name",
    "category_name",
    "brand_names",
    "generic_name",
)

# Field mappings; every keyword field gets its own independent dict.
_FIELD_PROPERTIES: dict = {
    **{field: {"type": "keyword"} for field in _KEYWORD_FIELDS},
    # Analyzed title plus an exact-match "raw" keyword sub-field.
    "section_title": {
        "type": "text",
        "analyzer": "jp_med",
        "fields": {"raw": {"type": "keyword"}},
    },
    "line_num": {"type": "integer"},
    "text": {"type": "text", "analyzer": "jp_med"},
    "revision_date": {"type": "date"},
    "_md_sha256": {"type": "keyword"},
}

# Index-creation body (settings + mappings) for the sections index.
INDEX_BODY: dict = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "refresh_interval": "1s",
        },
        "analysis": {
            "tokenizer": {"sudachi_tokenizer": _SUDACHI_TOKENIZER},
            "filter": _TOKEN_FILTERS,
            "analyzer": {"jp_med": _JP_MED_ANALYZER},
        },
    },
    "mappings": {"properties": _FIELD_PROPERTIES},
}


# ---- Client --------------------------------------------------------------


def client() -> OpenSearch:
    """Build a new OpenSearch client pointed at ``OS_HOST``.

    Returns:
        A client created with HTTP compression enabled and a timeout of 60.
        A fresh instance is constructed on every call.
    """
    connection_options = {
        "hosts": [OS_HOST],
        "http_compress": True,
        "timeout": 60,
    }
    return OpenSearch(**connection_options)


# ---- 章節アクセス helpers（PageIndex 退役後の verbatim 取得経路） -------


def list_drug_sections(yj_full: str, *, limit: int = 200) -> list[dict]:
    """Return the sections of one drug, ordered by ascending ``line_num``.

    Args:
        yj_full: Value matched exactly against the ``yj_full`` field.
        limit: Maximum number of sections to request; values above 500 are
            capped at 500.

    Returns:
        One dict per hit with the keys ``section_title``, ``line_num``,
        ``text_len`` (length of the stored ``text``), ``brand`` (the first
        brand name, or ``""`` when none is stored) and ``generic``.
    """
    search_body = {
        "size": min(limit, 500),
        # "text" is fetched only so that its length can be reported.
        "_source": ["section_title", "line_num", "text", "brand_names", "generic_name"],
        "query": {"term": {"yj_full": yj_full}},
        "sort": [{"line_num": "asc"}],
    }
    resp = client().search(index=INDEX_NAME, body=search_body)

    def _first_brand(brands) -> str:
        # _source returns the value as it was indexed: a keyword field may
        # hold either a list or a single string. Indexing a bare string with
        # [0] would yield only its first character, so return it whole.
        if isinstance(brands, str):
            return brands
        return (brands or [""])[0]

    return [
        {
            "section_title": src.get("section_title", ""),
            "line_num": src.get("line_num"),
            "text_len": len(src.get("text", "") or ""),
            "brand": _first_brand(src.get("brand_names")),
            "generic": src.get("generic_name") or "",
        }
        for src in (hit["_source"] for hit in resp["hits"]["hits"])
    ]


def get_drug_section_text(yj_full: str, section_title: str) -> str:
    """Return the verbatim text of the section ``(yj_full, section_title)``.

    Args:
        yj_full: Value matched exactly against the ``yj_full`` field.
        section_title: Value matched exactly against the un-analyzed
            ``section_title.raw`` sub-field.

    Returns:
        The stored ``text`` of the matching section, or ``""`` when no
        section matches or the stored text is empty. If several sections of
        the same drug share the title, the one with the lowest ``line_num``
        is returned.
    """
    search_body = {
        "size": 1,
        "_source": ["text"],
        # Exact-match lookups need no relevance scoring, so use filter context.
        "query": {"bool": {"filter": [
            {"term": {"yj_full": yj_full}},
            {"term": {"section_title.raw": section_title}},
        ]}},
        # Make the pick deterministic when a title repeats within one drug.
        "sort": [{"line_num": "asc"}],
    }
    resp = client().search(index=INDEX_NAME, body=search_body)
    hits = resp["hits"]["hits"]
    if not hits:
        return ""
    return hits[0]["_source"].get("text", "") or ""