qwen_agent/skills/support/kfs-answer/scripts/search.py
2026-05-06 19:39:53 +08:00

241 lines
8.4 KiB
Python

"""search_v2 — adds:
R_a: expose L1 (truncated)
R_b: expose per-sheet description with fallback to name/block_titles; cap 5 sheets per file
R_c: tie-break sort by rare-keyword hits (rare = appears in <30 files), applied after score and row-data hits
"""
import os
import sys
from collections import Counter
import yaml
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _session import get_session_dir
# Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root
# NOTE(review): this climbs exactly three directories above the script's own
# directory; if the skill is nested one level deeper than the chain above
# describes, the result is not the project root — confirm against the deployed layout.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Prefer <root>/datasets; fall back to <root>/dataset when the plural directory is absent.
_ds = os.path.join(_PROJECT_ROOT, "datasets")
DATASETS_DIR = _ds if os.path.isdir(_ds) else os.path.join(_PROJECT_ROOT, "dataset")
RARE_THRESHOLD = 30 # keyword appearing in < N files counts as "rare"
# Max characters of a file's L1 summary shown in the per-file output.
L1_MAX_CHARS = 300
# Max characters of a sheet description / block title used as a sheet label.
DESC_MAX_CHARS = 80
SHEET_CAP = 5 # per file
def _discover_datasets():
    """Return the sorted names of DATASETS_DIR's immediate subdirectories.

    Each subdirectory name is treated as a dataset_id. When DATASETS_DIR is
    not an existing directory the result is an empty list.
    """
    if not os.path.isdir(DATASETS_DIR):
        return []
    dataset_ids = []
    for name in sorted(os.listdir(DATASETS_DIR)):
        # Plain files sitting next to the dataset folders are ignored.
        if os.path.isdir(os.path.join(DATASETS_DIR, name)):
            dataset_ids.append(name)
    return dataset_ids
def load_knowledge_files(dataset_ids):
    """Load the YAML front matter of every knowledge.md under the given datasets.

    Walks DATASETS_DIR/<dataset_id>/<file_id>/knowledge.md. A file contributes
    an entry only when it starts with a ``---``-delimited front-matter block
    that parses to a non-empty mapping; anything else is skipped silently.

    Args:
        dataset_ids: iterable of dataset directory names under DATASETS_DIR.

    Returns:
        A list of dicts with keys "dataset_id", "file_id" and "meta" (the
        parsed front matter), in dataset_ids order and sorted by file_id
        within each dataset.
    """
    entries = []
    for dataset_id in dataset_ids:
        dataset_dir = os.path.join(DATASETS_DIR, dataset_id)
        if not os.path.isdir(dataset_dir):
            continue
        for file_id in sorted(os.listdir(dataset_dir)):
            file_dir = os.path.join(dataset_dir, file_id)
            if not os.path.isdir(file_dir):
                continue
            km_path = os.path.join(file_dir, "knowledge.md")
            if not os.path.isfile(km_path):
                continue
            try:
                with open(km_path, "r", encoding="utf-8") as f:
                    content = f.read()
                if not content.startswith("---"):
                    continue
                # parts[0] is the empty prefix, parts[1] the front matter,
                # parts[2] the document body.
                parts = content.split("---", 2)
                if len(parts) < 3:
                    continue
                meta = yaml.safe_load(parts[1])
            except Exception:
                # Deliberate best-effort: one unreadable or malformed file
                # must not abort the whole search.
                continue
            # Downstream code calls meta.get(...), so front matter that parses
            # to a scalar or a list is as unusable as an empty one.
            if not isinstance(meta, dict) or not meta:
                continue
            entries.append({"dataset_id": dataset_id, "file_id": file_id, "meta": meta})
    return entries
def build_searchable_text(entry):
    """Flatten an entry's metadata into one lowercase string for keyword matching.

    Concatenates, space-separated: L0, L1, source_name, then for every sheet
    its name, description, each column's name and description, and its
    block titles.

    Null values (``sheets: null``, ``columns: null``, ``L1: null`` ...) are
    treated as absent: they contribute an empty string instead of raising or
    leaking the literal text "None" into the searchable text.

    Args:
        entry: dict with a "meta" mapping (parsed knowledge.md front matter).

    Returns:
        The lowercased, space-joined text.
    """
    meta = entry["meta"]
    parts = [_as_text(meta.get(key)) for key in ("L0", "L1", "source_name")]
    for sheet in meta.get("sheets") or []:
        if not isinstance(sheet, dict):
            continue  # malformed sheet entry: nothing to index
        parts.append(_as_text(sheet.get("name")))
        parts.append(_as_text(sheet.get("description")))
        for col in sheet.get("columns") or []:
            if not isinstance(col, dict):
                continue
            parts.append(_as_text(col.get("name")))
            parts.append(_as_text(col.get("description")))
        parts.extend(_as_text(title) for title in sheet.get("block_titles") or [])
    return " ".join(parts).lower()


def _as_text(value):
    """Return str(value), mapping None to '' so nulls never read as 'None'."""
    return "" if value is None else str(value)
def score_entry(text, keywords):
    """Primary score: fraction of keywords found in text (same as v1).

    Each keyword is lowercased before the substring test; text is used as
    given. The ratio is rounded to 3 decimals, and an empty keyword list
    scores 0.0.
    """
    matched = 0
    for keyword in keywords:
        if keyword.lower() in text:
            matched += 1
    # Fall back to a denominator of 1 so an empty keyword list cannot divide by zero.
    denominator = len(keywords) or 1
    return round(matched / denominator, 3)
def rare_hits(text, keywords, rare_set):
    """Secondary score: count of keywords that are both rare and present in text.

    A keyword counts when its lowercase form is a member of rare_set and a
    substring of text.
    """
    count = 0
    for keyword in keywords:
        lowered = keyword.lower()
        if lowered in rare_set and lowered in text:
            count += 1
    return count
MIN_KW_LEN_FOR_DATA_SCAN = 2  # skip single-char keywords to avoid noise


def data_scan_hits(entry, keywords):
    """Count the keywords that occur in at least one row of the entry's knowledge.db.

    The database is looked up at
    DATASETS_DIR/<dataset_id>/<file_id>/knowledge.db. Every table is probed
    with a substring LIKE over all of its columns except "__src"; a keyword
    counts once, as soon as any table contains it. Keywords shorter than
    MIN_KW_LEN_FOR_DATA_SCAN characters are not scanned.

    Keywords are matched literally: "%" and "_" inside a keyword are escaped
    so they do not act as LIKE wildcards.

    Args:
        entry: dict with "dataset_id" and "file_id".
        keywords: iterable of keyword strings.

    Returns:
        The number of distinct scanned keywords found; 0 when the database is
        missing, cannot be opened, or its table list cannot be read.
    """
    import sqlite3

    db_path = os.path.join(DATASETS_DIR, entry["dataset_id"], entry["file_id"], "knowledge.db")
    if not os.path.isfile(db_path):
        return 0
    scan_kws = [k for k in keywords if len(k) >= MIN_KW_LEN_FOR_DATA_SCAN]
    if not scan_kws:
        return 0
    try:
        conn = sqlite3.connect(db_path)
    except sqlite3.Error:
        return 0
    try:
        try:
            tables = [row[0] for row in conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'")]
        except sqlite3.Error:
            return 0

        # The schema does not depend on the keyword, so build one probe query
        # per table up front instead of re-reading it for every keyword.
        like_clause = " LIKE ? ESCAPE '\\'"
        probes = []
        for table in tables:
            try:
                cols = [row[1] for row in conn.execute(
                    f"PRAGMA table_info({_quote_ident(table)})") if row[1] != "__src"]
            except sqlite3.Error:
                continue  # best-effort: skip a table whose schema cannot be read
            if not cols:
                continue
            where = " OR ".join(_quote_ident(c) + like_clause for c in cols)
            # LIMIT 1 lets SQLite stop at the first matching row.
            sql = f"SELECT 1 FROM {_quote_ident(table)} WHERE {where} LIMIT 1"
            probes.append((sql, len(cols)))

        hits = set()
        for kw in scan_kws:
            pattern = _like_pattern(kw)
            for sql, n_params in probes:
                try:
                    row = conn.execute(sql, [pattern] * n_params).fetchone()
                except (sqlite3.Error, ValueError):
                    continue  # best-effort: a failing table must not abort the scan
                if row is not None:
                    hits.add(kw)
                    break  # this kw found, no need to check other tables
        return len(hits)
    finally:
        # Always release the handle, including on the early-return paths.
        conn.close()


def _quote_ident(name):
    """Quote an SQLite identifier, doubling any embedded double quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def _like_pattern(keyword):
    """Build a substring LIKE pattern that matches keyword literally (ESCAPE '\\')."""
    escaped = keyword.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return f"%{escaped}%"
def sheet_label(sheet):
    """R_b: choose a display label: description → name → block_titles[0] → '(untitled)'.

    The description and the first block title are cut to DESC_MAX_CHARS.
    A name starting with "sheet_" is passed over in favour of a block title
    and is only used when the sheet has no block titles.
    """
    description = str(sheet.get("description") or "").strip()
    name = str(sheet.get("name") or "").strip()
    block_titles = sheet.get("block_titles") or []

    if description:
        return description[:DESC_MAX_CHARS]
    if name and not name.startswith("sheet_"):
        return name
    if block_titles:
        return str(block_titles[0])[:DESC_MAX_CHARS]
    return name if name else "(untitled)"
def main():
    """CLI entry point: rank knowledge files against keywords and print a summary.

    Usage: ``search.py <query> [keyword ...]``. sys.argv[1] (the raw query) is
    accepted as a positional argument but is not used for ranking; the
    keywords are sys.argv[2:].

    Prints "NO_MATCH" when no knowledge file could be loaded. Otherwise prints
    a totals line, a RECOMMENDED line of ``file_id:sheet_id`` pairs, one
    FILE_REF line per returned file, and a per-file summary (L0, L1 and up to
    SHEET_CAP sheets). The FILE_REF mapping is also written to file_refs.txt
    in the directory returned by get_session_dir().
    """
    # Auto-discover datasets from ./dataset/ or ./datasets/ subdirectories
    dataset_ids = _discover_datasets()
    keywords = sys.argv[2:]
    top_n = 20
    entries = load_knowledge_files(dataset_ids)
    if not entries:
        print("NO_MATCH")
        return

    # R_c: pre-compute which keywords are "rare" across all entries
    texts = [(e, build_searchable_text(e)) for e in entries]
    lowered_keywords = {kw.lower() for kw in keywords}  # built once, not per entry
    doc_freq = Counter()
    for _, text in texts:
        doc_freq.update(k for k in lowered_keywords if k in text)
    rare_set = {k for k, n in doc_freq.items() if n < RARE_THRESHOLD}

    # Score: metadata text match + row data scan
    scored = []
    for entry, text in texts:
        scored.append({
            **entry,
            "score": score_entry(text, keywords),
            "rare_hits": rare_hits(text, keywords, rare_set),
            "data_hits": data_scan_hits(entry, keywords),
        })
    # Primary by score, secondary by data_hits, tertiary by rare_hits
    scored.sort(key=lambda x: (-x["score"], -x["data_hits"], -x["rare_hits"]))
    matched = [item for item in scored if item["score"] > 0 or item["data_hits"] > 0]
    # With no hit at all, fall back to the first top_n files rather than returning nothing.
    results = (matched or scored)[:top_n]

    pairs = []
    seen = set()
    for r in results:
        # Cap sheets per file in RECOMMENDED too (keeps output budget sane).
        # `or []` tolerates an explicit `sheets: null` in the front matter.
        file_sheets = (r["meta"].get("sheets") or [])[:SHEET_CAP]
        for sheet in file_sheets:
            sheet_id = sheet.get("id")
            if sheet_id is None:
                continue  # a sheet without an id cannot be referenced
            pair = f"{r['file_id']}:{sheet_id}"
            if pair not in seen:
                seen.add(pair)
                pairs.append(pair)

    note = " (keyword matched)" if matched else " (no keyword match, showing all)"
    print(f"Total files: {len(entries)}, Returned: {len(results)}{note}")
    print()
    print(f"RECOMMENDED: {','.join(pairs)}")
    print()

    # file_refs.txt — written to per-session dir (isolated by TRACE_ID)
    refs_path = os.path.join(get_session_dir(), "file_refs.txt")
    with open(refs_path, "w", encoding="utf-8") as f:
        for idx, r in enumerate(results):
            f_code = f"F{idx + 1}"
            source_name = r["meta"].get("source_name", "unknown")
            print(f'FILE_REF: {f_code}={r["file_id"]}({source_name})')
            f.write(f'{f_code}={r["file_id"]}({source_name})\n')
    print()

    # Per file summary — R_a adds L1, R_b adds per-sheet label, cap sheets
    for r in results:
        meta = r["meta"]
        source_name = meta.get("source_name", "unknown")
        # str() guards against non-string YAML scalars (e.g. a bare number).
        l0 = str(meta.get("L0") or "").replace("\n", " ")[:150]
        l1 = str(meta.get("L1") or "").replace("\n", " ")[:L1_MAX_CHARS]
        sheets = meta.get("sheets") or []
        shown = sheets[:SHEET_CAP]
        extra = len(sheets) - len(shown)
        print(f" {r['file_id']} (score:{r['score']},rare:{r['rare_hits']}) {source_name}")
        if l0:
            print(f" L0: {l0}")
        if l1:
            print(f" L1: {l1}")
        for s in shown:
            name = s.get("name", "?")
            type_ = s.get("type", "?")
            rc = s.get("row_count", s.get("block_count", "?"))
            label = sheet_label(s)
            print(f" - {name}[{type_},{rc}]: {label}")
        if extra > 0:
            print(f" - ... ({extra} more sheets omitted)")
        print()
# Run the search only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()