"""OpenSearch `pmda_sections` index spec + client helper. Mapping 与 wiki-skill 的 sudachi 配置共用 plugin(同一 OS 集群、同一 sudachi core 字典)。每个 doc 对应一份说明书的一个章节节点,冗余存药品 metadata 以避 免 JOIN(详见 design.md §2.1.2)。 环境变量: OS_HOST (默认 http://localhost:9200,与 wiki-skill `_common.py` 一致) PMDA_OS_INDEX (默认 pmda_sections) """ from __future__ import annotations import os from opensearchpy import OpenSearch # Plugin env vars: PMDA_OPENSEARCH_URL(推奨)/ OPENSEARCH_URL / OPENSEARCH_HOST OS_HOST = ( os.environ.get("PMDA_OPENSEARCH_URL") or os.environ.get("OPENSEARCH_URL") or os.environ.get("OPENSEARCH_HOST") or "http://localhost:9200" ) INDEX_NAME = os.environ.get("PMDA_OS_INDEX", "pmda_sections") # ---- Mapping spec -------------------------------------------------------- INDEX_BODY: dict = { "settings": { "index": { "number_of_shards": 1, "number_of_replicas": 0, "refresh_interval": "1s", }, "analysis": { "tokenizer": { "sudachi_tokenizer": { "type": "sudachi_tokenizer", "split_mode": "C", "discard_punctuation": True, } }, "filter": { # "med_synonyms": { # "type": "synonym", # 初期最小集 — 命中错例后扩充。同义词条之间逗号分隔代表 # 等价、空格视为词内字符。 # "synonyms": [ # "Stevens-Johnson, 皮膚粘膜眼症候群, SJS", # "中毒性表皮壊死融解症, TEN, ライエル症候群", # "QT延長, トルサード, Torsades de pointes", # "間質性肺炎, 肺臓炎", # "横紋筋融解症, ラブドミオリーシス", # "アナフィラキシー, アナフィラキシーショック", # "無顆粒球症, 顆粒球減少症", # ], # }, "jp_pos": { "type": "sudachi_part_of_speech", }, "jp_stop": { "type": "sudachi_ja_stop", }, }, "analyzer": { "jp_med": { "type": "custom", # icu_normalizer はデフォルト image に未含、sudachi_ # normalizedform で全角半角・正規化はカバーされる。 "tokenizer": "sudachi_tokenizer", "filter": [ "sudachi_baseform", "sudachi_normalizedform", "jp_pos", "jp_stop", "lowercase", ], } }, }, }, "mappings": { "properties": { "yj_full": {"type": "keyword"}, "yj_code": {"type": "keyword"}, "l1_code": {"type": "keyword"}, "l2_code": {"type": "keyword"}, "l2_name": {"type": "keyword"}, "category_name": {"type": "keyword"}, "brand_names": {"type": "keyword"}, "generic_name": {"type": "keyword"}, "section_title": { "type": "text", "analyzer": "jp_med", "fields": {"raw": {"type": "keyword"}}, }, "line_num": {"type": "integer"}, "text": {"type": "text", "analyzer": "jp_med"}, "revision_date": {"type": "date"}, "_md_sha256": {"type": "keyword"}, } }, } # ---- Client -------------------------------------------------------------- def client() -> OpenSearch: """Return an OpenSearch client bound to OS_HOST.""" return OpenSearch(hosts=[OS_HOST], http_compress=True, timeout=60) # ---- 章節アクセス helpers(PageIndex 退役後の verbatim 取得経路) ------- def list_drug_sections(yj_full: str, *, limit: int = 200) -> list[dict]: """1 薬の全章節を line_num 昇順で返す。 各 element: {section_title, line_num, text_len, brand, generic} """ cli = client() resp = cli.search(index=INDEX_NAME, body={ "size": min(limit, 500), "_source": ["section_title", "line_num", "text", "brand_names", "generic_name"], "query": {"term": {"yj_full": yj_full}}, "sort": [{"line_num": "asc"}], }) out = [] for h in resp["hits"]["hits"]: s = h["_source"] out.append({ "section_title": s.get("section_title", ""), "line_num": s.get("line_num"), "text_len": len(s.get("text", "") or ""), "brand": (s.get("brand_names") or [""])[0], "generic": s.get("generic_name") or "", }) return out def get_drug_section_text(yj_full: str, section_title: str) -> str: """指定 (yj_full, section_title) の verbatim 章節 text。見つからなければ ""。""" cli = client() resp = cli.search(index=INDEX_NAME, body={ "size": 1, "_source": ["text"], "query": {"bool": {"must": [ {"term": {"yj_full": yj_full}}, {"term": {"section_title.raw": section_title}}, ]}}, }) hits = resp["hits"]["hits"] if not hits: return "" return hits[0]["_source"].get("text", "") or ""