qwen_agent/skills/developing/pmda-drug-info/os_client.py
2026-06-12 11:03:30 +08:00

158 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""OpenSearch `pmda_sections` index spec + client helper.
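The mapping shares the plugin with wiki-skill's sudachi configuration (same
OpenSearch cluster, same sudachi core dictionary). Each document is one
section node of one package insert; drug metadata is stored redundantly on
every document to avoid JOINs (see design.md §2.1.2).

Environment variables:
    PMDA_OPENSEARCH_URL / OPENSEARCH_URL / OPENSEARCH_HOST
        OpenSearch endpoint, exposed as the module constant OS_HOST. The
        first non-empty variable wins (default http://localhost:9200, the
        same default as wiki-skill `_common.py`).
    PMDA_OS_INDEX
        Index name, exposed as INDEX_NAME (default pmda_sections).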
"""
from __future__ import annotations
import os
from opensearchpy import OpenSearch
# OpenSearch endpoint. The plugin env vars are consulted in priority order:
# PMDA_OPENSEARCH_URL (preferred), then OPENSEARCH_URL, then OPENSEARCH_HOST.
# The first non-empty value wins; otherwise http://localhost:9200 is used.
OS_HOST = next(
    (
        value
        for value in map(
            os.environ.get,
            ("PMDA_OPENSEARCH_URL", "OPENSEARCH_URL", "OPENSEARCH_HOST"),
        )
        if value
    ),
    "http://localhost:9200",
)

# Name of the target index, overridable through PMDA_OS_INDEX.
INDEX_NAME = os.getenv("PMDA_OS_INDEX", "pmda_sections")
# ---- Mapping spec --------------------------------------------------------
INDEX_BODY: dict = {
    # Index-level settings and the Japanese analysis chain (Sudachi plugin).
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "refresh_interval": "1s",
        },
        "analysis": {
            "tokenizer": {
                "sudachi_tokenizer": {
                    "type": "sudachi_tokenizer",
                    "split_mode": "C",
                    "discard_punctuation": True,
                },
            },
            "filter": {
                # Medical synonym filter, currently disabled.
                # Initial minimal set -- extend it as mismatches are found.
                # Within one entry, commas separate equivalent terms and
                # spaces are treated as characters inside a term.
                # "med_synonyms": {
                #     "type": "synonym",
                #     "synonyms": [
                #         "Stevens-Johnson, 皮膚粘膜眼症候群, SJS",
                #         "中毒性表皮壊死融解症, TEN, ライエル症候群",
                #         "QT延長, トルサード, Torsades de pointes",
                #         "間質性肺炎, 肺臓炎",
                #         "横紋筋融解症, ラブドミオリーシス",
                #         "アナフィラキシー, アナフィラキシーショック",
                #         "無顆粒球症, 顆粒球減少症",
                #     ],
                # },
                "jp_pos": {"type": "sudachi_part_of_speech"},
                "jp_stop": {"type": "sudachi_ja_stop"},
            },
            "analyzer": {
                "jp_med": {
                    "type": "custom",
                    # icu_normalizer is not included in the default image;
                    # sudachi_normalizedform covers full-width/half-width
                    # folding and normalization instead.
                    "tokenizer": "sudachi_tokenizer",
                    "filter": [
                        "sudachi_baseform",
                        "sudachi_normalizedform",
                        "jp_pos",
                        "jp_stop",
                        "lowercase",
                    ],
                },
            },
        },
    },
    # Field mappings: drug metadata as exact-match keywords, section title
    # and body analyzed with the jp_med analyzer.
    "mappings": {
        "properties": {
            # Each keyword field gets its own (unshared) mapping dict.
            **{
                field: {"type": "keyword"}
                for field in (
                    "yj_full",
                    "yj_code",
                    "l1_code",
                    "l2_code",
                    "l2_name",
                    "category_name",
                    "brand_names",
                    "generic_name",
                )
            },
            # Analyzed title with a `raw` keyword sub-field for exact matching.
            "section_title": {
                "type": "text",
                "analyzer": "jp_med",
                "fields": {"raw": {"type": "keyword"}},
            },
            "line_num": {"type": "integer"},
            "text": {"type": "text", "analyzer": "jp_med"},
            "revision_date": {"type": "date"},
            "_md_sha256": {"type": "keyword"},
        },
    },
}
# ---- Client --------------------------------------------------------------
def client() -> OpenSearch:
    """Build a new OpenSearch client pointed at ``OS_HOST``.

    Every call constructs a fresh client with HTTP compression enabled and
    ``timeout=60``.
    """
    options = {
        "hosts": [OS_HOST],
        "http_compress": True,
        "timeout": 60,
    }
    return OpenSearch(**options)
# ---- Section access helpers (verbatim retrieval path after PageIndex retirement) ----
def list_drug_sections(yj_full: str, *, limit: int = 200) -> list[dict]:
    """Return the sections of one drug, ordered by ascending ``line_num``.

    Args:
        yj_full: Value matched exactly (term query) against ``yj_full``.
        limit: Maximum number of sections to request; capped at 500.

    Returns:
        One dict per hit with the keys ``section_title``, ``line_num``,
        ``text_len`` (character count of the section text), ``brand`` (first
        entry of ``brand_names``, or ``""``) and ``generic``.
    """
    cli = client()
    try:
        resp = cli.search(index=INDEX_NAME, body={
            "size": min(limit, 500),
            # `text` is fetched only so its length can be reported.
            "_source": ["section_title", "line_num", "text", "brand_names", "generic_name"],
            "query": {"term": {"yj_full": yj_full}},
            "sort": [{"line_num": "asc"}],
        })
    finally:
        # client() builds a new client on every call, so release its
        # connections as soon as the request is finished.
        cli.close()

    out = []
    for hit in resp["hits"]["hits"]:
        src = hit["_source"]
        brands = src.get("brand_names") or [""]
        if isinstance(brands, str):
            # _source returns the value as it was indexed; a single brand
            # stored as a bare string must not be indexed character-wise.
            brands = [brands]
        out.append({
            "section_title": src.get("section_title") or "",
            "line_num": src.get("line_num"),
            "text_len": len(src.get("text", "") or ""),
            "brand": brands[0],
            "generic": src.get("generic_name") or "",
        })
    return out
def get_drug_section_text(yj_full: str, section_title: str) -> str:
    """Return the verbatim text of the section (yj_full, section_title).

    Both values are matched exactly: ``yj_full`` against the keyword field
    and ``section_title`` against the ``section_title.raw`` keyword sub-field.
    If several sections of the drug share the title, the one with the lowest
    ``line_num`` is returned.

    Returns:
        The section's ``text``, or an empty string when nothing matches or
        the stored text is missing or empty.
    """
    cli = client()
    try:
        resp = cli.search(index=INDEX_NAME, body={
            "size": 1,
            "_source": ["text"],
            # Exact-match lookup: no relevance scoring is needed, so the
            # clauses run in filter context.
            "query": {"bool": {"filter": [
                {"term": {"yj_full": yj_full}},
                {"term": {"section_title.raw": section_title}},
            ]}},
            # Make the pick deterministic when a title occurs more than once.
            "sort": [{"line_num": "asc"}],
        })
    finally:
        # client() builds a new client on every call, so release its
        # connections as soon as the request is finished.
        cli.close()

    hits = resp["hits"]["hits"]
    if not hits:
        return ""
    return hits[0]["_source"].get("text", "") or ""