#!/usr/bin/env python3
"""
table-query CLI. Fast, LLM-free table querying.

Talks to the felo-mygpt table_query endpoints:
  - search-tables : POST /v1/table_query/search_tables/{bot_id}
  - get-schemas : POST /v1/table_query/get_schemas/{bot_id}
  - run-sql : POST /v1/table_query/run_sql/{bot_id}

The agent drives the orchestration (rewrite -> locate -> author SQL -> run);
the backend only does cheap work, so each call returns in seconds.
"""

import argparse
import hashlib
import json
import os
import sys
from typing import Optional

try:
    import requests
except ImportError:
    # Do not exit at import time: that would make the module impossible to
    # import (e.g. from tests). main() and _post() report the problem instead.
    requests = None

_REQUESTS_MISSING = "requests module is required. Please install it with: pip install requests"

DEFAULT_BACKEND_HOST = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai")
# NOTE(review): "master" is a hard-coded fallback key; confirm it is acceptable
# outside development environments.
DEFAULT_MASTERKEY = os.getenv("MASTERKEY", "master")

# Same citation contract the legacy table_rag_retrieve used, so the agent's
# behaviour is unchanged.
# NOTE(review): the example after "Format:" is an empty code span and the
# "NEVER put ... on the same line" bullet has no object -- the citation tag
# itself appears to have been lost from this text. Restore it from the legacy
# prompt; it is deliberately not guessed here.
TABLE_CITATION_INSTRUCTIONS = """
When using the retrieved table knowledge below, you MUST add XML citation tags for factual claims.
Format: ``
- Parse `__src`: `F1S2R5` = file_ref F1, sheet 2, row 5
- Look up file_id in `file_ref_table`
- Combine same-sheet rows into one citation: `rows=[2, 4, 6]`
- MANDATORY: Create SEPARATE citation for EACH (file, sheet) combination
- NEVER put on the same line as a bullet point or table row
- Citations MUST be on separate lines AFTER the complete list/table
- NEVER include the `__src` column in your response - it is internal metadata only
- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge
- NEVER collect all citations and place them at the end of your response
"""


def load_config() -> dict:
    """Load robot_config.json from the robot project root (3 levels up from scripts/).

    Returns:
        The parsed configuration, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object. Problems
        are reported as a warning on stderr rather than raised.
    """
    config_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'robot_config.json')
    if not os.path.exists(config_path):
        return {}
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        print(f"Warning: failed to load robot_config.json: {e}", file=sys.stderr)
        return {}
    if not isinstance(config, dict):
        # Callers use .get() on the result, so anything but an object is unusable.
        print("Warning: failed to load robot_config.json: top-level value is not an object",
              file=sys.stderr)
        return {}
    return config


def _resolve_bot_id(cli_bot_id: Optional[str]) -> Optional[str]:
    """Pick the bot id to use.

    Precedence: the --bot-id CLI value, then `bot_id` in robot_config.json,
    then the BOT_ID and ASSISTANT_ID environment variables.

    Returns:
        The first non-empty candidate, or None when nothing is configured.
    """
    if cli_bot_id:
        return cli_bot_id
    return load_config().get('bot_id') or os.getenv("BOT_ID") or os.getenv("ASSISTANT_ID")


def _post(path: str, bot_id: str, payload: dict) -> dict:
    """POST `payload` as JSON to /v1/table_query/{path}/{bot_id} and return the reply.

    Args:
        path: Endpoint name (search_tables, get_schemas or run_sql).
        bot_id: Bot the request is scoped to; also part of the auth token.
        payload: JSON-serialisable request body.

    Returns:
        The decoded JSON object from the response body.

    Raises:
        RuntimeError: if `requests` is unavailable, the status is not 200, or
            the body is not a JSON object.
    """
    if requests is None:
        raise RuntimeError(_REQUESTS_MISSING)

    # Tolerate a BACKEND_HOST configured with a trailing slash.
    base = DEFAULT_BACKEND_HOST.rstrip("/")
    url = f"{base}/v1/table_query/{path}/{bot_id}"
    # The bearer token is md5("<masterkey>:<bot_id>").
    # NOTE(review): presumably this scheme is fixed by the backend -- confirm
    # before changing the hash.
    auth_token = hashlib.md5(f"{DEFAULT_MASTERKEY}:{bot_id}".encode()).hexdigest()
    headers = {
        "content-type": "application/json",
        "authorization": f"Bearer {auth_token}",
    }
    # Forward a trace id when the environment provides one.
    trace_id = os.getenv("TRACE_ID") or os.getenv("X_REQUEST_ID")
    if trace_id:
        headers["X-Request-ID"] = trace_id

    resp = requests.post(url, json=payload, headers=headers, timeout=30)
    if resp.status_code != 200:
        raise RuntimeError(f"API {path} returned {resp.status_code}: {resp.text}")
    try:
        data = resp.json()
    except ValueError as e:
        raise RuntimeError(f"API {path} returned a non-JSON response: {resp.text[:500]}") from e
    if not isinstance(data, dict):
        # Every caller reads the reply with .get(), so require an object here.
        raise RuntimeError(f"API {path} returned an unexpected JSON payload: {resp.text[:500]}")
    return data


def cmd_search_tables(args, bot_id: str) -> str:
    """Run `search-tables` and format the candidate tables for the agent.

    Args:
        args: Parsed CLI namespace; reads `query` and `top_k`.
        bot_id: Bot whose tables are searched.

    Returns:
        A bullet list of candidate tables, or a hint to fall back to document
        retrieval when the backend returns no tables.
    """
    res = _post("search_tables", bot_id, {"query": args.query, "top_k": args.top_k})
    tables = res.get("tables", [])
    if not tables:
        return ("No matching tables found. If the question may be answered from documents "
                "instead of spreadsheets, fall back to the rag_retrieve document tool.")

    lines = [f"Found {len(tables)} candidate table(s). Pick the relevant ones and call "
             f"`get-schemas` for them next.\n"]
    for t in tables:
        # A null score would make round() raise; treat it like a missing one.
        score = t.get('score') or 0
        lines.append(
            f"- table_name: {t['table_name']}\n"
            f" file: {t.get('file_name', '')} | sheet: {t.get('sheet_name', '')} "
            f"| score: {round(score, 3)}\n"
            f" description: {t.get('table_description', '')}"
        )
    return "\n".join(lines)


def cmd_get_schemas(args, bot_id: str) -> str:
    """Run `get-schemas` and format schemas plus sample rows for the agent.

    Args:
        args: Parsed CLI namespace; reads `tables` (comma-separated names) and
            `sample_rows`.
        bot_id: Bot whose tables are described.

    Returns:
        One Markdown section per resolved table (CREATE TABLE statement and,
        when present, sample rows as quoted CSV), followed by instructions for
        the run-sql step. An error/notice string when nothing can be resolved.
    """
    table_names = [t.strip() for t in args.tables.split(',') if t.strip()]
    if not table_names:
        # Nothing to ask the backend for; avoid a pointless round trip.
        return "Error: --tables must contain at least one table name."

    res = _post("get_schemas", bot_id,
                {"table_names": table_names, "sample_rows": args.sample_rows})
    schemas = res.get("schemas", [])
    missing = res.get("missing_tables", [])
    if not schemas:
        return f"No schemas resolved. Missing tables: {missing}"

    blocks = []
    for s in schemas:
        block = [
            f"### Table: {s['table_name']}",
            f"File: {s.get('file_name', '')} | Sheet: {s.get('sheet_name', '')}",
            "```sql",
            s.get('sql_create', ''),
            "```",
        ]
        sample = s.get('sample_rows') or []
        if sample:
            block.append("Sample rows (format hint only, NOT the row count):")
            block.append("```csv")
            for row in sample:
                # Quote every cell and double embedded quotes (CSV escaping).
                # NOTE(review): assumes each row is a sequence of cells -- confirm
                # against the backend response.
                block.append(",".join('"' + str(c).replace('"', '""') + '"' for c in row))
            block.append("```")
        blocks.append("\n".join(block))

    out = "\n\n".join(blocks)
    if missing:
        out += f"\n\nNote: these requested tables were not found: {missing}"
    out += ("\n\nNow author a SQLite plan and run it by piping the JSON to run-sql on stdin:\n"
            " run-sql <<'PLAN'\n"
            " {\"queries\": [{\"step\": 1, \"sql\": \"CREATE TEMP TABLE \\\"final_table_step1\\\" "
            "AS SELECT ...\", \"source_table_names\": [\"...\"], "
            "\"destine_table_name\": \"final_table_step1\", \"destine_table_type\": \"final\"}]}\n"
            " PLAN\n"
            "Quote all identifiers with double quotes.")
    return out


def cmd_run_sql(args, bot_id: str) -> str:
    """Run `run-sql`: submit an authored SQL plan and format the result.

    Args:
        args: Parsed CLI namespace; reads `plan_file`, `max_rows`, `cell_max`.
        bot_id: Bot the plan is executed for.

    Returns:
        On success, the citation instructions followed by whichever of
        `instruction`, `knowledge` and `extra_goal` the backend returned.
        Otherwise a message starting with "Error:" or "SQL execution failed:".
    """
    # Read the plan from --plan-file if given, otherwise from stdin (heredoc).
    try:
        if args.plan_file:
            with open(args.plan_file, 'r', encoding='utf-8') as f:
                raw = f.read()
        else:
            raw = sys.stdin.read()
        if not raw.strip():
            return ("Error: no plan provided. Pipe the JSON plan via stdin, e.g.\n"
                    " python scripts/table_query.py run-sql <<'PLAN'\n"
                    " {\"queries\": [...]}\n"
                    " PLAN")
        plan = json.loads(raw)
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return f"Error: failed to read SQL plan: {e}"

    # accept either {"queries": [...]} or a bare [...] list
    queries = plan.get("queries") if isinstance(plan, dict) else plan
    # Reject scalars/strings too: a truthy non-list must not reach the backend.
    if not isinstance(queries, list) or not queries:
        return "Error: the plan must contain a non-empty `queries` list."

    payload = {"queries": queries}
    if args.max_rows is not None:
        payload["max_rows"] = args.max_rows
    if args.cell_max is not None:
        payload["cell_max"] = args.cell_max

    res = _post("run_sql", bot_id, payload)
    if not res.get("success"):
        return (f"SQL execution failed: {res.get('error')}\n"
                "Fix your SQL and call run-sql again. Do NOT restart from search-tables.")

    parts = [TABLE_CITATION_INSTRUCTIONS]
    for key in ("instruction", "knowledge", "extra_goal"):
        if res.get(key):
            parts.append(res[key])
    return "\n".join(parts)


def main():
    """CLI entry point: parse arguments, resolve the bot id, run the sub-command.

    Prints the command's output to stdout. Exits with status 1 when `requests`
    is missing, no bot id can be resolved, or the command raises.
    """
    if requests is None:
        print(f"Error: {_REQUESTS_MISSING}")
        sys.exit(1)

    parser = argparse.ArgumentParser(description="table-query: fast LLM-free table querying")
    parser.add_argument("--bot-id", default=None, help="Bot id (defaults to robot_config.json)")
    sub = parser.add_subparsers(dest="command", required=True)

    p_search = sub.add_parser("search-tables", help="Vector-locate relevant tables")
    p_search.add_argument("--query", "-q", required=True, help="Rewritten retrieval query")
    p_search.add_argument("--top-k", "-k", type=int, default=20)

    p_schemas = sub.add_parser("get-schemas", help="Fetch CREATE TABLE schema + sample rows")
    p_schemas.add_argument("--tables", "-t", required=True, help="Comma-separated table names")
    p_schemas.add_argument("--sample-rows", type=int, default=3)

    p_run = sub.add_parser("run-sql", help="Execute an authored SQL plan (JSON via stdin or file)")
    p_run.add_argument("--plan-file", "-f", default=None,
                       help="Path to plan JSON file (optional; defaults to reading stdin)")
    p_run.add_argument("--max-rows", type=int, default=None,
                       help="Max total result rows (raise if a result came back truncated)")
    p_run.add_argument("--cell-max", type=int, default=None,
                       help="Max characters per cell before truncation")

    args = parser.parse_args()

    bot_id = _resolve_bot_id(args.bot_id)
    if not bot_id:
        print("Error: bot_id is required (robot_config.json / --bot-id / BOT_ID env)")
        sys.exit(1)

    handlers = {
        "search-tables": cmd_search_tables,
        "get-schemas": cmd_get_schemas,
        "run-sql": cmd_run_sql,
    }
    # Top-level boundary: report any failure as a one-line message and exit 1.
    try:
        handler = handlers.get(args.command)
        if handler is not None:
            print(handler(args, bot_id))
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()