"""search_v2 — adds: R_a: expose L1 (truncated) R_b: expose per-sheet description with fallback to name/block_titles; cap 5 sheets per file R_c: secondary sort by rare-keyword hits (rare = appears in <30 files) """ import os import sys from collections import Counter import yaml sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from _session import get_session_dir # Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root _PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) _ds = os.path.join(_PROJECT_ROOT, "datasets") DATASETS_DIR = _ds if os.path.isdir(_ds) else os.path.join(_PROJECT_ROOT, "dataset") RARE_THRESHOLD = 30 # keyword appearing in < N files counts as "rare" L1_MAX_CHARS = 300 DESC_MAX_CHARS = 80 SHEET_CAP = 5 # per file def _discover_datasets(): """Scan DATASETS_DIR for subdirectory names (each is a dataset_id).""" if not os.path.isdir(DATASETS_DIR): return [] return [d for d in sorted(os.listdir(DATASETS_DIR)) if os.path.isdir(os.path.join(DATASETS_DIR, d))] def load_knowledge_files(dataset_ids): entries = [] for dataset_id in dataset_ids: dataset_dir = os.path.join(DATASETS_DIR, dataset_id) if not os.path.isdir(dataset_dir): continue for file_id in sorted(os.listdir(dataset_dir)): file_dir = os.path.join(dataset_dir, file_id) if not os.path.isdir(file_dir): continue km_path = os.path.join(file_dir, "knowledge.md") if not os.path.isfile(km_path): continue try: with open(km_path, "r", encoding="utf-8") as f: content = f.read() if not content.startswith("---"): continue parts = content.split("---", 2) if len(parts) < 3: continue meta = yaml.safe_load(parts[1]) if not meta: continue entries.append({"dataset_id": dataset_id, "file_id": file_id, "meta": meta}) except Exception: continue return entries def build_searchable_text(entry): meta = entry["meta"] parts = [ str(meta.get("L0", "")), str(meta.get("L1", "")), str(meta.get("source_name", "")), ] for sheet in meta.get("sheets", []): parts.append(str(sheet.get("name", ""))) parts.append(str(sheet.get("description", ""))) for col in sheet.get("columns", []): parts.append(str(col.get("name", ""))) parts.append(str(col.get("description", ""))) for title in (sheet.get("block_titles") or []): parts.append(str(title)) return " ".join(parts).lower() def score_entry(text, keywords): """Primary score = hit_ratio (same as v1).""" hits = sum(1 for k in keywords if k.lower() in text) return round(hits / max(len(keywords), 1), 3) def rare_hits(text, keywords, rare_set): """Secondary score = # rare keywords that hit.""" return sum(1 for k in keywords if k.lower() in text and k.lower() in rare_set) MIN_KW_LEN_FOR_DATA_SCAN = 2 # skip single-char keywords to avoid noise def data_scan_hits(entry, keywords): """Scan knowledge.db row data for keyword matches. Returns # keywords found in any row. Lightweight: one LIKE query per (table, keyword). Skips keywords < MIN_KW_LEN chars. """ import sqlite3 db_path = os.path.join(DATASETS_DIR, entry["dataset_id"], entry["file_id"], "knowledge.db") if not os.path.isfile(db_path): return 0 scan_kws = [k for k in keywords if len(k) >= MIN_KW_LEN_FOR_DATA_SCAN] if not scan_kws: return 0 try: conn = sqlite3.connect(db_path) tables = [r[0] for r in conn.execute( "SELECT name FROM sqlite_master WHERE type='table'").fetchall()] except Exception: return 0 hits = set() for kw in scan_kws: for table in tables: try: cols = [r[1] for r in conn.execute(f'PRAGMA table_info("{table}")').fetchall() if r[1] != "__src"] if not cols: continue where = " OR ".join(f'"{c}" LIKE ?' for c in cols) params = [f"%{kw}%"] * len(cols) cursor = conn.execute(f'SELECT COUNT(*) FROM "{table}" WHERE {where}', params) if cursor.fetchone()[0] > 0: hits.add(kw) break # this kw found, no need to check other tables except Exception: continue conn.close() return len(hits) def sheet_label(sheet): """R_b: description → name → block_titles[0] → '(untitled)'""" desc = str(sheet.get("description") or "").strip() if desc: return desc[:DESC_MAX_CHARS] name = str(sheet.get("name") or "").strip() if name and not name.startswith("sheet_"): return name titles = sheet.get("block_titles") or [] if titles: return str(titles[0])[:DESC_MAX_CHARS] return name or "(untitled)" def main(): # Auto-discover datasets from ./dataset/ or ./datasets/ subdirectories dataset_ids = _discover_datasets() query = sys.argv[1] if len(sys.argv) > 1 else "" keywords = sys.argv[2:] top_n = 20 entries = load_knowledge_files(dataset_ids) if not entries: print("NO_MATCH") return # R_c: pre-compute which keywords are "rare" across all entries texts = [(e, build_searchable_text(e)) for e in entries] doc_freq = Counter() for _, text in texts: for k in set(kw.lower() for kw in keywords): if k in text: doc_freq[k] += 1 rare_set = {k for k, n in doc_freq.items() if n < RARE_THRESHOLD} # Score: metadata text match + row data scan scored = [] for entry, text in texts: s = score_entry(text, keywords) r = rare_hits(text, keywords, rare_set) d = data_scan_hits(entry, keywords) scored.append({**entry, "score": s, "rare_hits": r, "data_hits": d}) # Primary by score, secondary by data_hits, tertiary by rare_hits scored.sort(key=lambda x: (-x["score"], -x["data_hits"], -x["rare_hits"])) matched = [s for s in scored if s["score"] > 0 or s["data_hits"] > 0] results = (matched or scored)[:top_n] pairs = [] seen = set() for r in results: # Cap sheets per file in RECOMMENDED too (keeps output budget sane) file_sheets = r["meta"].get("sheets", [])[:SHEET_CAP] for sheet in file_sheets: pair = f"{r['file_id']}:{sheet['id']}" if pair not in seen: seen.add(pair) pairs.append(pair) note = " (keyword matched)" if matched else " (no keyword match, showing all)" print(f"Total files: {len(entries)}, Returned: {len(results)}{note}") print() print(f"RECOMMENDED: {','.join(pairs)}") print() # file_refs.txt — written to per-session dir (isolated by TRACE_ID) refs_path = os.path.join(get_session_dir(), "file_refs.txt") with open(refs_path, "w", encoding="utf-8") as f: for idx, r in enumerate(results): f_code = f"F{idx + 1}" source_name = r["meta"].get("source_name", "unknown") print(f'FILE_REF: {f_code}={r["file_id"]}({source_name})') f.write(f'{f_code}={r["file_id"]}({source_name})\n') print() # Per file summary — R_a adds L1, R_b adds per-sheet label, cap sheets for r in results: meta = r["meta"] source_name = meta.get("source_name", "unknown") l0 = (meta.get("L0", "") or "").replace("\n", " ")[:150] l1 = (meta.get("L1", "") or "").replace("\n", " ")[:L1_MAX_CHARS] sheets = meta.get("sheets", []) shown = sheets[:SHEET_CAP] extra = len(sheets) - len(shown) print(f" {r['file_id']} (score:{r['score']},rare:{r['rare_hits']}) {source_name}") if l0: print(f" L0: {l0}") if l1: print(f" L1: {l1}") for s in shown: name = s.get("name", "?") type_ = s.get("type", "?") rc = s.get("row_count", s.get("block_count", "?")) label = sheet_label(s) print(f" - {name}[{type_},{rc}]: {label}") if extra > 0: print(f" - ... ({extra} more sheets omitted)") print() if __name__ == "__main__": main()