158 lines
5.6 KiB
Python
158 lines
5.6 KiB
Python
"""OpenSearch `pmda_sections` index spec + client helper.
|
||
|
||
Mapping 与 wiki-skill 的 sudachi 配置共用 plugin(同一 OS 集群、同一 sudachi
|
||
core 字典)。每个 doc 对应一份说明书的一个章节节点,冗余存药品 metadata 以避
|
||
免 JOIN(详见 design.md §2.1.2)。
|
||
|
||
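Environment variables:
  PMDA_OPENSEARCH_URL / OPENSEARCH_URL / OPENSEARCH_HOST
      OpenSearch endpoint, checked in that order; the first non-empty value
      becomes the module constant OS_HOST
      (default http://localhost:9200, same default as wiki-skill `_common.py`)
  PMDA_OS_INDEX  (default pmda_sections)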
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
|
||
from opensearchpy import OpenSearch
|
||
|
||
# Plugin env vars, highest priority first: PMDA_OPENSEARCH_URL (recommended),
# then OPENSEARCH_URL, then OPENSEARCH_HOST. Unset or empty values are skipped
# and the local default is used when none of them yields a value.
OS_HOST = next(
    (
        candidate
        for candidate in map(
            os.environ.get,
            ("PMDA_OPENSEARCH_URL", "OPENSEARCH_URL", "OPENSEARCH_HOST"),
        )
        if candidate
    ),
    "http://localhost:9200",
)
# Name of the index that all helpers in this module query.
INDEX_NAME = os.getenv("PMDA_OS_INDEX", "pmda_sections")
|
||
|
||
|
||
# ---- Mapping spec --------------------------------------------------------
|
||
|
||
# Fields mapped as plain ``keyword``, listed in the order they appear in the
# mapping.
_KEYWORD_FIELDS = (
    "yj_full",
    "yj_code",
    "l1_code",
    "l2_code",
    "l2_name",
    "category_name",
    "brand_names",
    "generic_name",
)

# Sudachi-based analysis chain behind the ``jp_med`` analyzer.
_ANALYSIS_SETTINGS: dict = {
    "tokenizer": {
        "sudachi_tokenizer": {
            "type": "sudachi_tokenizer",
            "split_mode": "C",
            "discard_punctuation": True,
        },
    },
    "filter": {
        # Synonym filter, currently commented out and not referenced by the
        # analyzer below.
        # Initial minimal set -- extend it as mis-hit cases turn up. Entries
        # separated by commas are equivalent; a space is treated as a
        # character inside a term.
        # "med_synonyms": {
        #     "type": "synonym",
        #     "synonyms": [
        #         "Stevens-Johnson, 皮膚粘膜眼症候群, SJS",
        #         "中毒性表皮壊死融解症, TEN, ライエル症候群",
        #         "QT延長, トルサード, Torsades de pointes",
        #         "間質性肺炎, 肺臓炎",
        #         "横紋筋融解症, ラブドミオリーシス",
        #         "アナフィラキシー, アナフィラキシーショック",
        #         "無顆粒球症, 顆粒球減少症",
        #     ],
        # },
        "jp_pos": {"type": "sudachi_part_of_speech"},
        "jp_stop": {"type": "sudachi_ja_stop"},
    },
    "analyzer": {
        "jp_med": {
            "type": "custom",
            # icu_normalizer is not included in the default image;
            # sudachi_normalizedform covers full-width/half-width folding
            # and normalization instead.
            "tokenizer": "sudachi_tokenizer",
            "filter": [
                "sudachi_baseform",
                "sudachi_normalizedform",
                "jp_pos",
                "jp_stop",
                "lowercase",
            ],
        },
    },
}

# Index creation body: index-level settings, the analysis chain above, and the
# field mapping for one section document.
INDEX_BODY: dict = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "refresh_interval": "1s",
        },
        "analysis": _ANALYSIS_SETTINGS,
    },
    "mappings": {
        "properties": {
            **{field: {"type": "keyword"} for field in _KEYWORD_FIELDS},
            # Analyzed title, plus an exact-match ``raw`` keyword sub-field.
            "section_title": {
                "type": "text",
                "analyzer": "jp_med",
                "fields": {"raw": {"type": "keyword"}},
            },
            "line_num": {"type": "integer"},
            "text": {"type": "text", "analyzer": "jp_med"},
            "revision_date": {"type": "date"},
            "_md_sha256": {"type": "keyword"},
        },
    },
}
|
||
|
||
|
||
# ---- Client --------------------------------------------------------------
|
||
|
||
|
||
def client() -> OpenSearch:
    """Build an OpenSearch client that talks to ``OS_HOST``.

    A new client object is constructed on every call, with HTTP compression
    enabled and ``timeout=60``.
    """
    connection_options = {
        "hosts": [OS_HOST],
        "http_compress": True,
        "timeout": 60,
    }
    return OpenSearch(**connection_options)
|
||
|
||
|
||
# ---- 章節アクセス helpers(PageIndex 退役後の verbatim 取得経路) -------
|
||
|
||
|
||
def list_drug_sections(yj_full: str, *, limit: int = 200) -> list[dict]:
    """Return the indexed sections of one drug, ordered by ascending line_num.

    Args:
        yj_full: Value matched exactly (term query) against the ``yj_full``
            keyword field.
        limit: Maximum number of sections to request; capped at 500.

    Returns:
        One dict per hit with the keys ``section_title``, ``line_num``,
        ``text_len`` (length of the section text), ``brand`` (first brand
        name, or "") and ``generic`` (generic name, or "").
    """
    search_body = {
        "size": min(limit, 500),
        # ``text`` is fetched only so that its length can be reported.
        "_source": ["section_title", "line_num", "text", "brand_names", "generic_name"],
        "query": {"term": {"yj_full": yj_full}},
        "sort": [{"line_num": "asc"}],
    }
    resp = client().search(index=INDEX_NAME, body=search_body)

    sections = []
    for hit in resp["hits"]["hits"]:
        src = hit["_source"]
        brands = src.get("brand_names") or [""]
        if isinstance(brands, str):
            # _source is returned as it was indexed, so a single-valued field
            # can arrive as a bare string; indexing that with [0] would yield
            # only its first character.
            brands = [brands]
        sections.append({
            "section_title": src.get("section_title", ""),
            "line_num": src.get("line_num"),
            "text_len": len(src.get("text", "") or ""),
            "brand": brands[0],
            "generic": src.get("generic_name") or "",
        })
    return sections
|
||
|
||
|
||
def get_drug_section_text(yj_full: str, section_title: str) -> str:
    """Fetch the verbatim text of the section (yj_full, section_title).

    Both values are matched exactly with term queries, against ``yj_full``
    and ``section_title.raw`` respectively, and only the first hit is used.

    Returns:
        The hit's ``text`` value, or "" when nothing matches or the stored
        text is missing/empty.
    """
    exact_match_clauses = [
        {"term": {"yj_full": yj_full}},
        {"term": {"section_title.raw": section_title}},
    ]
    request = {
        "size": 1,
        "_source": ["text"],
        "query": {"bool": {"must": exact_match_clauses}},
    }
    response = client().search(index=INDEX_NAME, body=request)

    matches = response["hits"]["hits"]
    if matches:
        return matches[0]["_source"].get("text", "") or ""
    return ""
|