"""Budget-aware auto query for knowledge files. Usage: python3 query.py ,... ... Keywords are separate positional arguments (not comma-separated). For db-type sheets: keyword SQL with budget control (COUNT → sample → select columns → LIMIT). For markdown-type sheets: keyword section matching within budget. Output: TSV (or markdown section) followed by a `[CITATIONS]` block with pre-built tags. The `__src` column is consumed internally and stripped from visible output — agent should preserve and place CITATION tags near the data they cite. datasets directory: ./datasets/ (gbase-agent-service) or ./dataset/ (catalog-agent), auto-detected at runtime. dataset_ids are discovered automatically from subdirectories under datasets directory. """ import os import re import sqlite3 import sys import yaml sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from _session import get_session_dir # Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root _PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) _ds = os.path.join(_PROJECT_ROOT, "datasets") DATASETS_DIR = _ds if os.path.isdir(_ds) else os.path.join(_PROJECT_ROOT, "dataset") def _discover_datasets(): """Scan DATASETS_DIR for subdirectory names (each is a dataset_id).""" if not os.path.isdir(DATASETS_DIR): return [] return [d for d in sorted(os.listdir(DATASETS_DIR)) if os.path.isdir(os.path.join(DATASETS_DIR, d))] def load_file_ref_map(): """Load file_id → (F{n}, filename) mapping from file_refs.txt (in session dir).""" refs_path = os.path.join(get_session_dir(), "file_refs.txt") mapping = {} # file_id → (f_code, filename) if not os.path.isfile(refs_path): return mapping ref_pat = re.compile(r"^(F\d+)=([0-9a-f-]+)\((.+?)\)\s*$") with open(refs_path, "r", encoding="utf-8") as f: for line in f: m = ref_pat.match(line.strip()) if m: mapping[m.group(2)] = (m.group(1), m.group(3)) return mapping # Row-level: cell value stored as `__src="F0S1R5"` 
# (xls-agent-parse wraps it), so non-anchored + re.search() extracts the
# triple. Sheet-level comes from extract_sheet_src which returns bare `F0S1`,
# so that one stays anchored.
SRC_ROW_PAT = re.compile(r"F(\d+)S(\d+)R(\d+)")
SRC_SHEET_PAT = re.compile(r"^F(\d+)S(\d+)$")

# Printed when the script is started without its mandatory first argument.
_USAGE = "Usage: python3 query.py FILE_ID[:SHEET_ID],... [QUESTION] [KEYWORD ...]"

# Characters that would break the one-row-per-line, tab-separated layout if
# they were emitted inside a cell.
_TSV_BREAKERS = re.compile(r"[\t\r\n]+")


def _format_citation(file_id, sheet_num, filename, row_nums=None):
    """Build one CITATION tag. file_id=UUID. row_nums=None → sheet-level (no rows attr).

    NOTE(review): in this copy of the file both return statements are empty
    f-strings, while `fn_attr` and `rows_str` are computed and never used.
    The tag literal appears to have been stripped from the text; restore it
    from version control. As written the function always returns "". It is
    not called anywhere in the visible code.
    """
    fn_attr = f' filename="{filename}"' if filename else ""
    if row_nums is None:
        return f''
    rows_str = "[" + ", ".join(str(r) for r in row_nums) + "]"
    return f''


def replace_f0(text, f_code):
    """Replace F0 with assigned f_code (e.g., F1) in __src values.

    Returns `text` unchanged when `f_code` is empty or already "F0";
    otherwise every "F0S" substring is rewritten to "<f_code>S".
    """
    if not f_code or f_code == "F0":
        return text
    return text.replace('F0S', f'{f_code}S')


# ---------------------------------------------------------------------------
# NOTE(review): UNRECOVERABLE GAP — restore this region from version control.
#
# In this copy of the file the text between the first statement of
# `extract_sheet_src` and the middle of what appears to be
# `query_markdown_sheet` is missing, and the regex literals that survive are
# empty (markup-like spans seem to have been stripped). `main()` below calls
# `calc_budget`, `find_file_dir`, `load_knowledge_meta`, `query_db_sheet`,
# `extract_sheet_src` and `query_markdown_sheet`; none of them has a usable
# definition in this copy, so the script cannot run until they are restored.
#
# The surviving text is kept here, commented out. It has been re-indented
# (nesting inferred) but is otherwise unchanged, so nothing further is lost.
#
# def extract_sheet_src(body, sheet_id):
#     """Extract __src value from marker. Returns empty string if not found."""
#     m = re.search(rf'[... text lost from here ...]
#
# [... to here; the next fragment is the end of the statement that assigns
#  `parts`, apparently inside query_markdown_sheet ...]
#     [...]", body)
#     markers = re.findall(r"", body)
#     section = ""
#     for i, marker in enumerate(markers):
#         if marker == sheet_id and i < len(parts) - 1:
#             section = parts[i + 1].strip()
#             break
#     if not section and len(markers) == 0 and len(parts) == 1:
#         section = body.strip()
#     if not section:
#         return {"content": "", "note": "No content found"}
#     if len(section) <= budget:
#         return {"content": section, "full": True}
#     # Keyword-based line matching with context
#     lines = section.split("\n")
#     matched_indices = set()
#     context = 3
#     for i, line in enumerate(lines):
#         for kw in keywords:
#             if kw and kw.lower() in line.lower():
#                 for j in range(max(0, i - context), min(len(lines), i + context + 1)):
#                     matched_indices.add(j)
#     if not matched_indices:
#         truncated = section[:budget]
#         last_nl = truncated.rfind("\n")
#         if last_nl > budget * 0.7:
#             truncated = truncated[:last_nl]
#         return {"content": truncated, "note": f"[No keyword match. First {len(truncated)} chars of {len(section)}]"}
#     result_lines = []
#     chars = 0
#     prev = -2
#     for idx in sorted(matched_indices):
#         line = lines[idx]
#         line_chars = len(line) + 1
#         if chars + line_chars > budget:
#             break
#         if idx > prev + 1:
#             result_lines.append("---")
#             chars += 4
#         result_lines.append(line)
#         chars += line_chars
#         prev = idx
#     return {"content": "\n".join(result_lines), "matched_lines": len(result_lines)}
# ---------------------------------------------------------------------------


def _append_citation_records(records):
    """Append each record as one JSON line to the session's citations.jsonl.

    Does nothing (and does not create the file) when `records` is empty.
    """
    if not records:
        return
    import json  # local import: the file's top-level imports do not include json

    citations_path = os.path.join(get_session_dir(), "citations.jsonl")
    with open(citations_path, "a", encoding="utf-8") as cf:
        for record in records:
            cf.write(json.dumps(record, ensure_ascii=False) + "\n")


def _tsv_cell(value, max_len=200):
    """Render one cell for TSV output.

    None becomes "". Runs of tab / CR / LF are flattened to a single space so
    that a cell can neither shift columns nor split a row, and the text is cut
    to `max_len` characters followed by "..." when longer.
    """
    text = "" if value is None else str(value)
    text = _TSV_BREAKERS.sub(" ", text)
    if len(text) > max_len:
        text = text[:max_len] + "..."
    return text


def _collect_row_sources(rows, f_code):
    """Group row numbers by (file_code, sheet_num) parsed from each row's `__src`.

    Rows whose `__src` is empty or does not contain an F<n>S<n>R<n> triple are
    ignored. `f_code`, when set, replaces the F0 placeholder before parsing.
    """
    groups = {}  # (file_code, sheet_num) -> set of row_nums
    for row in rows:
        raw_src = row.get("__src")
        if not raw_src:
            continue
        src_val = str(raw_src)
        if f_code:
            src_val = replace_f0(src_val, f_code)
        m = SRC_ROW_PAT.search(src_val)
        if m:
            key = (f"F{m.group(1)}", int(m.group(2)))
            groups.setdefault(key, set()).add(int(m.group(3)))
    return groups


def _report_db_sheet(fid, file_dir, sid, sheet, keywords, budget, f_code, filename):
    """Query one db-type sheet, print it as TSV and record row-level citations."""
    db_path = os.path.join(file_dir, "knowledge.db")
    db_table = sheet.get("db_table", sid)
    print(f"db: {db_path}, table: {db_table}")

    result = query_db_sheet(db_path, db_table, sheet.get("columns", []), keywords, budget)
    if "error" in result:
        print(f" ERROR: {result['error']}")
        return
    if not result.get("rows"):
        print(f" No matching rows (total: {result.get('total', 0)})")
        return

    budget_info = []
    if result.get("fields_reduced"):
        budget_info.append(f"fields: {len(result['columns'])}/{len(result['all_columns'])}")
    if result.get("rows_limited"):
        budget_info.append(f"rows: {result['count']}/{result['total']}")
    budget_str = f" [BUDGET: {', '.join(budget_info)}]" if budget_info else ""

    # Hide __src from both COLUMNS report and TSV display — it's consumed
    # into the citation records below.
    display_cols = [c for c in result["columns"] if c != "__src"]
    biz_all_cols = [c for c in result["all_columns"] if c != "__src"]
    omitted = [c for c in biz_all_cols if c not in display_cols]

    print(f" TABLE: {result['table']} ({result['count']}/{result['total']} rows){budget_str}")
    print(f" COLUMNS: {', '.join(display_cols)}")
    if omitted:
        print(f" OMITTED: {len(omitted)} columns")
    if result.get("keyword"):
        print(f" KEYWORD: {result['keyword']}")
    # Warn if keyword filtering returned suspiciously few rows
    if result.get("keyword") and result["count"] <= 3 and result["total"] >= 10:
        print(f" ⚠ NOTE: keyword \"{result['keyword']}\" matched only {result['count']}/{result['total']} rows. Results may be incomplete — consider removing this keyword.")
    print("-" * 40)

    # TSV output — __src stripped, collected into src_groups for citations.
    print("\t".join(display_cols))
    for row in result["rows"]:
        print("\t".join(_tsv_cell(row.get(c)) for c in display_cols))

    src_groups = {}
    if "__src" in result["columns"]:
        src_groups = _collect_row_sources(result["rows"], f_code)
    _append_citation_records([
        {
            "file": fid,
            "filename": filename,
            "sheet": sheet_num,
            "rows": sorted(src_groups[(file_code, sheet_num)]),
            "source": "query",
        }
        for (file_code, sheet_num) in sorted(src_groups)
    ])


def _report_markdown_sheet(fid, sid, body, keywords, budget, f_code, filename):
    """Print the budgeted section of one markdown-type sheet and record a sheet-level citation."""
    if not body:
        print(" ERROR: no body content")
        return

    src_tag = extract_sheet_src(body, sid)
    if src_tag and f_code:
        src_tag = replace_f0(src_tag, f_code)

    result = query_markdown_sheet(body, sid, keywords, budget)
    if result.get("note"):
        print(f" {result['note']}")
    print("-" * 40)
    print(result.get("content", ""))

    # Write sheet-level citation to session file (no stdout).
    if src_tag:
        m = SRC_SHEET_PAT.match(src_tag)
        if m:
            _append_citation_records([{
                "file": fid,
                "filename": filename,
                "sheet": int(m.group(2)),
                "rows": [],
                "source": "query",
            }])


def main():
    """Command-line entry point.

    argv[1] is a comma-separated list of ``file_id`` or ``file_id:sheet_id``
    entries, argv[2] is the question (accepted for the calling convention,
    not used by the query itself) and argv[3:] are the keywords.

    For every entry the matching sheets are queried within a per-entry
    character budget and printed to stdout; citation records are appended to
    `citations.jsonl` in the session directory. When argv[1] is missing, the
    usage line is written to stderr and the process exits with status 2.

    NOTE(review): leading spaces in the printed messages are reproduced as
    they appear in this copy of the file; if the original used wider indents
    they were collapsed before this review and should be restored.
    """
    # Validate the command line first so a bare invocation gets a usage
    # message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        print(_USAGE, file=sys.stderr)
        sys.exit(2)

    # Auto-discover datasets from ./dataset/ or ./datasets/ subdirectories
    dataset_ids = _discover_datasets()

    keywords = sys.argv[3:]  # remaining positional args are keywords
    entries = []
    for raw in sys.argv[1].split(","):
        entry = raw.strip()
        if not entry:
            continue
        if ":" in entry:
            fid, sid = entry.split(":", 1)
            entries.append((fid.strip(), sid.strip()))
        else:
            entries.append((entry, None))

    per_entry_budget = calc_budget(len(entries))
    print(f"[Budget: {per_entry_budget} chars/entry, {len(entries)} entries]")

    # Load F0→F{n} mapping from search.py
    f_ref_map = load_file_ref_map()  # file_id → (f_code, filename)

    for fid, target_sheet_id in entries:
        file_dir = find_file_dir(dataset_ids, fid)
        if not file_dir:
            print(f"\n{'='*60}")
            print(f"file_id: {fid}")
            print(" ERROR: not found")
            continue

        meta, body = load_knowledge_meta(file_dir)
        if not meta:
            print(f"\n{'='*60}")
            print(f"file_id: {fid}")
            print(" ERROR: knowledge.md invalid")
            continue

        source_name = meta.get("source_name", "unknown")
        # `sheets` may be present but empty (None); treat that as no sheets.
        sheets_meta = {s["id"]: s for s in (meta.get("sheets") or [])}

        if target_sheet_id and target_sheet_id in sheets_meta:
            target_sheets = [(target_sheet_id, sheets_meta[target_sheet_id])]
        elif target_sheet_id:
            print(f"\n{'='*60}")
            print(f"file_id: {fid}, sheet: {target_sheet_id}")
            print(" ERROR: sheet not found")
            continue
        else:
            target_sheets = list(sheets_meta.items())

        # The file reference is the same for every sheet of this file.
        f_code, filename = f_ref_map.get(fid, ("", ""))

        for sid, sheet in target_sheets:
            stype = sheet.get("type", "unknown")
            sname = sheet.get("name", "?")
            print(f"\n{'='*60}")
            print(f"file_id: {fid} / {sid}: {sname} [{stype}]")
            print(f"source: {source_name}")

            if stype == "db":
                _report_db_sheet(fid, file_dir, sid, sheet, keywords,
                                 per_entry_budget, f_code, filename)
            elif stype == "markdown":
                _report_markdown_sheet(fid, sid, body, keywords,
                                       per_entry_budget, f_code, filename)
            else:
                print(f" ERROR: unknown type '{stype}'")

    print(f"\n{'='*60}")
    print(f"Done. Queried {len(entries)} entries.")


if __name__ == "__main__":
    main()