241 lines
8.4 KiB
Python
241 lines
8.4 KiB
Python
"""search_v2 — adds:

R_a: expose L1 (truncated)
R_b: expose per-sheet description with fallback to name/block_titles; cap 5 sheets per file
R_c: tie-break sort by rare-keyword hits, after score and row-data hits (rare = appears in <30 files)
"""
|
|
import os
|
|
import sys
|
|
from collections import Counter
|
|
|
|
import yaml
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
from _session import get_session_dir
|
|
|
|
# The project root sits three levels above this script:
# scripts/ → kfs-answer/ → skills/ → project root
_PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "..", "..")
)

# Prefer <root>/datasets when it exists; otherwise fall back to <root>/dataset.
_ds = os.path.join(_PROJECT_ROOT, "datasets")
if os.path.isdir(_ds):
    DATASETS_DIR = _ds
else:
    DATASETS_DIR = os.path.join(_PROJECT_ROOT, "dataset")

# A keyword found in fewer than this many files counts as "rare".
RARE_THRESHOLD = 30
# Maximum characters of the L1 summary shown per file.
L1_MAX_CHARS = 300
# Maximum characters of a sheet description / block title shown as a label.
DESC_MAX_CHARS = 80
# Maximum number of sheets listed per file.
SHEET_CAP = 5
|
|
|
|
|
|
def _discover_datasets():
    """Return the sorted names of the subdirectories of DATASETS_DIR.

    Each subdirectory name is treated as a dataset_id. An empty list is
    returned when DATASETS_DIR itself is not a directory.
    """
    if not os.path.isdir(DATASETS_DIR):
        return []

    found = []
    for candidate in sorted(os.listdir(DATASETS_DIR)):
        # Plain files living next to the dataset folders are ignored.
        if os.path.isdir(os.path.join(DATASETS_DIR, candidate)):
            found.append(candidate)
    return found
|
|
|
|
|
|
def load_knowledge_files(dataset_ids):
    """Load the YAML front matter of every knowledge.md under the given datasets.

    Looks for DATASETS_DIR/<dataset_id>/<file_id>/knowledge.md and parses the
    block between the leading ``---`` and the next ``---`` with
    ``yaml.safe_load``. Files that are missing, unreadable, lack front matter,
    or whose front matter is empty or not a mapping are skipped.

    Args:
        dataset_ids: iterable of dataset directory names under DATASETS_DIR.

    Returns:
        A list of dicts with keys ``dataset_id``, ``file_id`` and ``meta``
        (the parsed front matter, always a non-empty dict).
    """
    entries = []
    for dataset_id in dataset_ids:
        dataset_dir = os.path.join(DATASETS_DIR, dataset_id)
        if not os.path.isdir(dataset_dir):
            continue
        for file_id in sorted(os.listdir(dataset_dir)):
            file_dir = os.path.join(dataset_dir, file_id)
            if not os.path.isdir(file_dir):
                continue
            km_path = os.path.join(file_dir, "knowledge.md")
            if not os.path.isfile(km_path):
                continue
            try:
                # utf-8-sig also reads plain UTF-8, and strips a BOM that would
                # otherwise defeat the startswith("---") check below.
                with open(km_path, "r", encoding="utf-8-sig") as f:
                    content = f.read()
                if not content.startswith("---"):
                    continue
                parts = content.split("---", 2)
                if len(parts) < 3:
                    continue
                meta = yaml.safe_load(parts[1])
            except Exception:
                # Deliberate best effort: one unreadable or malformed file
                # must not abort the whole search.
                continue
            # Front matter can legally parse to a scalar or a list; every
            # consumer calls meta.get(...), so only mappings are accepted.
            if not isinstance(meta, dict) or not meta:
                continue
            entries.append({"dataset_id": dataset_id, "file_id": file_id, "meta": meta})
    return entries
|
|
|
|
|
|
def build_searchable_text(entry):
    """Flatten an entry's metadata into one lowercase string for keyword matching.

    The text is built from the file-level ``L0``, ``L1`` and ``source_name``
    fields, then for each sheet its ``name``, ``description``, every column's
    ``name`` and ``description``, and every ``block_titles`` item.

    Fields that are absent or explicitly null contribute an empty string, so a
    null value neither raises nor injects the literal token "none".

    Args:
        entry: dict with a ``meta`` mapping (as built by load_knowledge_files).

    Returns:
        The space-joined, lower-cased text.
    """
    def as_text(value):
        # str(None) would add a searchable "None" token; treat null as empty.
        return "" if value is None else str(value)

    meta = entry["meta"]
    parts = [
        as_text(meta.get("L0")),
        as_text(meta.get("L1")),
        as_text(meta.get("source_name")),
    ]
    # `or []` covers both a missing key and an explicit null, matching the
    # guard the block_titles loop has always used.
    for sheet in meta.get("sheets") or []:
        parts.append(as_text(sheet.get("name")))
        parts.append(as_text(sheet.get("description")))
        for col in sheet.get("columns") or []:
            parts.append(as_text(col.get("name")))
            parts.append(as_text(col.get("description")))
        for title in sheet.get("block_titles") or []:
            parts.append(as_text(title))
    return " ".join(parts).lower()
|
|
|
|
|
|
def score_entry(text, keywords):
    """Primary score: fraction of keywords found in text (same as v1).

    Each keyword is lower-cased before the substring test; the ratio is
    rounded to three decimals. An empty keyword list scores 0.0.
    """
    matched = 0
    for keyword in keywords:
        if keyword.lower() in text:
            matched += 1
    # Fall back to 1 to avoid dividing by zero when no keywords were given.
    denominator = len(keywords) or 1
    return round(matched / denominator, 3)
|
|
|
|
|
|
def rare_hits(text, keywords, rare_set):
    """Secondary score: how many keywords both occur in text and are in rare_set.

    Keywords are lower-cased before both checks.
    """
    count = 0
    for keyword in keywords:
        lowered = keyword.lower()
        if lowered in text and lowered in rare_set:
            count += 1
    return count
|
|
|
|
|
|
MIN_KW_LEN_FOR_DATA_SCAN = 2 # skip single-char keywords to avoid noise


def data_scan_hits(entry, keywords):
    """Scan knowledge.db row data for keyword matches.

    Opens DATASETS_DIR/<dataset_id>/<file_id>/knowledge.db and, for each
    keyword, checks whether any column other than ``__src`` in any table
    contains it as a literal substring (SQL LIKE). ``%`` and ``_`` in a
    keyword are escaped so they match themselves rather than acting as
    wildcards.

    Args:
        entry: dict with ``dataset_id`` and ``file_id`` keys.
        keywords: iterable of keyword strings; those shorter than
            MIN_KW_LEN_FOR_DATA_SCAN characters are skipped.

    Returns:
        The number of distinct keywords found in at least one row. Returns 0
        when the database is missing, cannot be read, or no keyword qualifies.
    """
    import sqlite3

    db_path = os.path.join(DATASETS_DIR, entry["dataset_id"], entry["file_id"], "knowledge.db")
    if not os.path.isfile(db_path):
        return 0

    # De-duplicate while keeping order; each distinct keyword counts once.
    scan_kws = list(dict.fromkeys(
        k for k in keywords if len(k) >= MIN_KW_LEN_FOR_DATA_SCAN))
    if not scan_kws:
        return 0

    def quote_ident(name):
        # Double embedded quotes so any table/column name is a valid identifier.
        return '"' + str(name).replace('"', '""') + '"'

    like_clause = " LIKE ? ESCAPE '\\'"

    try:
        conn = sqlite3.connect(db_path)
    except sqlite3.Error:
        return 0

    try:
        try:
            tables = [r[0] for r in conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'").fetchall()]
        except sqlite3.Error:
            return 0

        # Build one existence query per table up front: the column list does
        # not depend on the keyword, so it is resolved once, not per keyword.
        searches = []
        for table in tables:
            try:
                cols = [r[1] for r in conn.execute(
                    f"PRAGMA table_info({quote_ident(table)})").fetchall()
                    if r[1] != "__src"]
            except sqlite3.Error:
                continue
            if not cols:
                continue
            where = " OR ".join(quote_ident(c) + like_clause for c in cols)
            sql = f"SELECT 1 FROM {quote_ident(table)} WHERE {where} LIMIT 1"
            searches.append((sql, len(cols)))

        hits = 0
        for kw in scan_kws:
            escaped = (kw.replace("\\", "\\\\")
                         .replace("%", "\\%")
                         .replace("_", "\\_"))
            pattern = f"%{escaped}%"
            for sql, n_cols in searches:
                try:
                    found = conn.execute(sql, [pattern] * n_cols).fetchone() is not None
                except sqlite3.Error:
                    # Best effort: a table that cannot be queried is skipped.
                    continue
                if found:
                    hits += 1
                    break  # this kw found, no need to check other tables
        return hits
    finally:
        # Always release the connection, including on the early returns above.
        conn.close()
|
|
|
|
|
|
def sheet_label(sheet):
    """R_b: pick a display label — description → name → block_titles[0] → '(untitled)'.

    The description and the first block title are cut to DESC_MAX_CHARS.
    A name starting with "sheet_" is only used when there are no block titles.
    """
    description = str(sheet.get("description") or "").strip()
    if description:
        return description[:DESC_MAX_CHARS]

    sheet_name = str(sheet.get("name") or "").strip()
    is_placeholder = sheet_name.startswith("sheet_")
    if sheet_name and not is_placeholder:
        return sheet_name

    block_titles = sheet.get("block_titles") or []
    if not block_titles:
        # Last resort: the placeholder name, or a fixed marker when empty.
        return sheet_name if sheet_name else "(untitled)"
    return str(block_titles[0])[:DESC_MAX_CHARS]
|
|
|
|
|
|
def main():
    """CLI entry point: rank knowledge files against keywords and print a summary.

    Command line: ``<script> <query> [keyword ...]``. ``sys.argv[1]`` is the
    query slot and is not used for scoring; ``sys.argv[2:]`` are the keywords.

    Prints ``NO_MATCH`` and returns when no knowledge files load. Otherwise
    prints a totals line, a ``RECOMMENDED:`` list of ``file_id:sheet_id``
    pairs, one ``FILE_REF:`` line per returned file, and a per-file summary.
    The F-code mapping is also written to ``file_refs.txt`` in the directory
    returned by ``get_session_dir()``.
    """
    # Auto-discover datasets from ./dataset/ or ./datasets/ subdirectories
    dataset_ids = _discover_datasets()
    # argv[1] (the query) is positional only; scoring uses the keywords.
    keywords = sys.argv[2:]
    top_n = 20

    entries = load_knowledge_files(dataset_ids)
    if not entries:
        print("NO_MATCH")
        return

    # R_c: pre-compute which keywords are "rare" across all entries
    texts = [(e, build_searchable_text(e)) for e in entries]
    # Lower-case the keywords once instead of once per file.
    lowered_keywords = {kw.lower() for kw in keywords}
    doc_freq = Counter()
    for _, text in texts:
        doc_freq.update(k for k in lowered_keywords if k in text)
    rare_set = {k for k, n in doc_freq.items() if n < RARE_THRESHOLD}

    # Score: metadata text match + row data scan
    scored = []
    for entry, text in texts:
        s = score_entry(text, keywords)
        r = rare_hits(text, keywords, rare_set)
        d = data_scan_hits(entry, keywords)
        scored.append({**entry, "score": s, "rare_hits": r, "data_hits": d})

    # Primary by score, secondary by data_hits, tertiary by rare_hits
    scored.sort(key=lambda x: (-x["score"], -x["data_hits"], -x["rare_hits"]))

    matched = [s for s in scored if s["score"] > 0 or s["data_hits"] > 0]
    # With no match at all, fall back to the first top_n files overall.
    results = (matched or scored)[:top_n]

    pairs = []
    seen = set()
    for r in results:
        # Cap sheets per file in RECOMMENDED too (keeps output budget sane).
        # `or []` tolerates a missing or null "sheets" key.
        file_sheets = (r["meta"].get("sheets") or [])[:SHEET_CAP]
        for sheet in file_sheets:
            sheet_id = sheet.get("id")
            if sheet_id is None:
                # A sheet without an id cannot be referenced; skip it.
                continue
            pair = f"{r['file_id']}:{sheet_id}"
            if pair not in seen:
                seen.add(pair)
                pairs.append(pair)

    note = " (keyword matched)" if matched else " (no keyword match, showing all)"
    print(f"Total files: {len(entries)}, Returned: {len(results)}{note}")
    print()
    print(f"RECOMMENDED: {','.join(pairs)}")
    print()

    # file_refs.txt — written to per-session dir (isolated by TRACE_ID)
    refs_path = os.path.join(get_session_dir(), "file_refs.txt")
    with open(refs_path, "w", encoding="utf-8") as f:
        for idx, r in enumerate(results):
            f_code = f"F{idx + 1}"
            source_name = r["meta"].get("source_name", "unknown")
            print(f'FILE_REF: {f_code}={r["file_id"]}({source_name})')
            f.write(f'{f_code}={r["file_id"]}({source_name})\n')
    print()

    # Per file summary — R_a adds L1, R_b adds per-sheet label, cap sheets
    for r in results:
        meta = r["meta"]
        source_name = meta.get("source_name", "unknown")
        # str() guards against non-string scalars in the front matter.
        l0 = str(meta.get("L0") or "").replace("\n", " ")[:150]
        l1 = str(meta.get("L1") or "").replace("\n", " ")[:L1_MAX_CHARS]
        sheets = meta.get("sheets") or []
        shown = sheets[:SHEET_CAP]
        extra = len(sheets) - len(shown)

        print(f" {r['file_id']} (score:{r['score']},rare:{r['rare_hits']}) {source_name}")
        if l0:
            print(f" L0: {l0}")
        if l1:
            print(f" L1: {l1}")
        for s in shown:
            name = s.get("name", "?")
            type_ = s.get("type", "?")
            rc = s.get("row_count", s.get("block_count", "?"))
            label = sheet_label(s)
            print(f" - {name}[{type_},{rc}]: {label}")
        if extra > 0:
            print(f" - ... ({extra} more sheets omitted)")
        print()
|
|
|
|
|
|
# Run the search only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|