214 lines
9.0 KiB
Python
Executable File
214 lines
9.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
table-query CLI.
|
|
|
|
Fast, LLM-free table querying. Talks to the felo-mygpt table_query endpoints:
|
|
- search-tables : POST /v1/table_query/search_tables/{bot_id}
|
|
- get-schemas : POST /v1/table_query/get_schemas/{bot_id}
|
|
- run-sql : POST /v1/table_query/run_sql/{bot_id}
|
|
|
|
The agent drives the orchestration (rewrite -> locate -> author SQL -> run);
|
|
the backend only does cheap work, so each call returns in seconds.
|
|
"""
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
try:
|
|
import requests
|
|
except ImportError:
|
|
print("Error: requests module is required. Please install it with: pip install requests")
|
|
sys.exit(1)
|
|
|
|
# Base URL of the backend API: taken from the BACKEND_HOST environment variable,
# falling back to the dev host below when that variable is unset.
DEFAULT_BACKEND_HOST = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai")
|
|
# Master key that _post() hashes together with the bot id to build the bearer
# token: taken from the MASTERKEY environment variable, falling back to "master".
DEFAULT_MASTERKEY = os.getenv("MASTERKEY", "master")
|
|
|
|
# Same citation contract the legacy table_rag_retrieve used, so the agent's
|
|
# <CITATION ... /> behaviour is unchanged.
|
|
TABLE_CITATION_INSTRUCTIONS = """<CITATION_INSTRUCTIONS>
|
|
When using the retrieved table knowledge below, you MUST add XML citation tags for factual claims.
|
|
|
|
Format: `<CITATION file="file_id" filename="name.xlsx" sheet=1 rows=[2, 4] />`
|
|
- Parse `__src`: `F1S2R5` = file_ref F1, sheet 2, row 5
|
|
- Look up file_id in `file_ref_table`
|
|
- Combine same-sheet rows into one citation: `rows=[2, 4, 6]`
|
|
- MANDATORY: Create SEPARATE citation for EACH (file, sheet) combination
|
|
- NEVER put <CITATION> on the same line as a bullet point or table row
|
|
- Citations MUST be on separate lines AFTER the complete list/table
|
|
- NEVER include the `__src` column in your response - it is internal metadata only
|
|
- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge
|
|
- NEVER collect all citations and place them at the end of your response
|
|
</CITATION_INSTRUCTIONS>
|
|
"""
|
|
|
|
|
|
def load_config() -> dict:
    """Load robot_config.json from the robot project root (3 levels up from scripts/).

    Returns:
        The parsed configuration as a dict. An empty dict is returned when the
        file is missing, cannot be read or decoded, is not valid JSON, or its
        top-level JSON value is not an object. Every failure other than a
        missing file is reported as a warning on stderr.
    """
    config_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'robot_config.json')
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing config file is a normal situation: stay silent and let the
        # caller fall back to CLI arguments / environment variables.
        return {}
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Warning: failed to load robot_config.json: {e}", file=sys.stderr)
        return {}
    if not isinstance(config, dict):
        # Callers use .get() on the result, so anything but an object is unusable.
        print("Warning: failed to load robot_config.json: top-level JSON value is not an object",
              file=sys.stderr)
        return {}
    return config
|
|
|
|
|
|
def _resolve_bot_id(cli_bot_id: str) -> str:
    """Choose the bot id for this invocation.

    Precedence, first non-empty value wins:
      1. the value passed on the command line,
      2. `bot_id` from robot_config.json,
      3. the BOT_ID environment variable,
      4. the ASSISTANT_ID environment variable.

    The config file is only read when no command-line value was given. When
    nothing is configured the result is whatever ASSISTANT_ID yields, i.e.
    None if that variable is unset.
    """
    if not cli_bot_id:
        from_config = load_config().get('bot_id')
        if from_config:
            return from_config
        from_env = os.getenv("BOT_ID")
        if from_env:
            return from_env
        return os.getenv("ASSISTANT_ID")
    return cli_bot_id
|
|
|
|
|
|
def _post(path: str, bot_id: str, payload: dict) -> dict:
    """POST a JSON payload to a table_query endpoint and return the decoded reply.

    Args:
        path: Endpoint name under /v1/table_query/ (e.g. "search_tables").
        bot_id: Bot identifier; appended to the URL and hashed into the auth token.
        payload: Request body, sent as JSON.

    Returns:
        The response body decoded from JSON.

    Raises:
        RuntimeError: If the backend answers with a status other than 200, or
            with a body that is not valid JSON.

    Exceptions raised by requests.post itself (connection errors, timeouts)
    propagate unchanged.
    """
    # Tolerate a trailing slash in BACKEND_HOST so the URL never contains "//v1".
    base_url = DEFAULT_BACKEND_HOST.rstrip("/")
    url = f"{base_url}/v1/table_query/{path}/{bot_id}"
    # The bearer token is the MD5 hex digest of "<masterkey>:<bot_id>".
    auth_token = hashlib.md5(f"{DEFAULT_MASTERKEY}:{bot_id}".encode()).hexdigest()
    headers = {
        "content-type": "application/json",
        "authorization": f"Bearer {auth_token}",
    }
    # Forward a trace id when the environment provides one.
    trace_id = os.getenv("TRACE_ID") or os.getenv("X_REQUEST_ID")
    if trace_id:
        headers["X-Request-ID"] = trace_id
    resp = requests.post(url, json=payload, headers=headers, timeout=30)
    if resp.status_code != 200:
        raise RuntimeError(f"API {path} returned {resp.status_code}: {resp.text}")
    try:
        return resp.json()
    except ValueError as e:
        # A 200 with an HTML/empty body would otherwise surface as an opaque
        # JSON decode error; report which endpoint misbehaved instead.
        raise RuntimeError(
            f"API {path} returned a non-JSON response: {resp.text[:500]}"
        ) from e
|
|
|
|
|
|
def cmd_search_tables(args, bot_id: str) -> str:
    """Run the search-tables command and render the candidate tables as text.

    Args:
        args: Parsed CLI arguments; reads `query` and `top_k`.
        bot_id: Bot whose tables are searched.

    Returns:
        A bullet list with one entry per candidate table (name, file, sheet,
        score rounded to 3 decimals, description), or a fallback hint when the
        backend returns no tables.

    Raises:
        KeyError: If a returned table entry has no `table_name`.
    """
    res = _post("search_tables", bot_id, {"query": args.query, "top_k": args.top_k})
    # `or []` also covers an explicit `"tables": null` in the reply.
    tables = res.get("tables") or []
    if not tables:
        return ("No matching tables found. If the question may be answered from documents "
                "instead of spreadsheets, fall back to the rag_retrieve document tool.")
    lines = [f"Found {len(tables)} candidate table(s). Pick the relevant ones and call "
             f"`get-schemas` for them next.\n"]
    for t in tables:
        # A null score would make round() raise TypeError; treat it like a missing one.
        score = t.get('score') or 0
        lines.append(
            f"- table_name: {t['table_name']}\n"
            f" file: {t.get('file_name','')} | sheet: {t.get('sheet_name','')} "
            f"| score: {round(score, 3)}\n"
            f" description: {t.get('table_description','')}"
        )
    return "\n".join(lines)
|
|
|
|
|
|
def cmd_get_schemas(args, bot_id: str) -> str:
    """Run the get-schemas command and render schemas plus sample rows as text.

    Args:
        args: Parsed CLI arguments; reads `tables` (comma-separated table
            names) and `sample_rows`.
        bot_id: Bot whose tables are inspected.

    Returns:
        One block per resolved table (heading, file/sheet line, fenced CREATE
        TABLE statement, optional fenced CSV sample rows), followed by a note
        about tables that were not found and by instructions for calling
        run-sql. An error/notice string is returned instead when no table names
        were supplied or the backend resolved no schemas.

    Raises:
        KeyError: If a returned schema entry has no `table_name`.
    """
    table_names = [t.strip() for t in args.tables.split(',') if t.strip()]
    if not table_names:
        # e.g. --tables "" or --tables ",": there is nothing to ask the backend for.
        return "Error: no table names provided. Pass a comma-separated list via --tables."
    res = _post("get_schemas", bot_id,
                {"table_names": table_names, "sample_rows": args.sample_rows})
    schemas = res.get("schemas", [])
    missing = res.get("missing_tables", [])
    if not schemas:
        return f"No schemas resolved. Missing tables: {missing}"
    blocks = []
    for s in schemas:
        block = [f"### Table: {s['table_name']}",
                 f"File: {s.get('file_name','')} | Sheet: {s.get('sheet_name','')}",
                 "```sql", s.get('sql_create', ''), "```"]
        sample = s.get('sample_rows') or []
        if sample:
            block.append("Sample rows (format hint only, NOT the row count):")
            block.append("```csv")
            for row in sample:
                # Quote every cell and double embedded quotes, CSV style.
                block.append(",".join('"' + str(c).replace('"', '""') + '"' for c in row))
            block.append("```")
        blocks.append("\n".join(block))
    out = "\n\n".join(blocks)
    if missing:
        out += f"\n\nNote: these requested tables were not found: {missing}"
    out += ("\n\nNow author a SQLite plan and run it by piping the JSON to run-sql on stdin:\n"
            " run-sql <<'PLAN'\n"
            " {\"queries\": [{\"step\": 1, \"sql\": \"CREATE TEMP TABLE \\\"final_table_step1\\\" "
            "AS SELECT ...\", \"source_table_names\": [\"...\"], "
            "\"destine_table_name\": \"final_table_step1\", \"destine_table_type\": \"final\"}]}\n"
            " PLAN\n"
            "Quote all identifiers with double quotes.")
    return out
|
|
|
|
|
|
def cmd_run_sql(args, bot_id: str) -> str:
    """Run the run-sql command: load a JSON SQL plan, execute it, render the result.

    Args:
        args: Parsed CLI arguments; reads `plan_file`, `max_rows` and `cell_max`.
        bot_id: Bot against whose tables the plan is executed.

    Returns:
        On success, the citation instructions followed by the backend's
        `instruction`, `knowledge` and `extra_goal` fields (those that are
        non-empty), joined by newlines. Otherwise an "Error: ..." or
        "SQL execution failed: ..." message describing what to fix.
    """
    # Read the plan from --plan-file if given, otherwise from stdin (heredoc).
    try:
        if args.plan_file:
            with open(args.plan_file, 'r', encoding='utf-8') as f:
                raw = f.read()
        else:
            raw = sys.stdin.read()
    except (OSError, UnicodeDecodeError) as e:
        return f"Error: failed to read SQL plan: {e}"
    if not raw.strip():
        return ("Error: no plan provided. Pipe the JSON plan via stdin, e.g.\n"
                " python scripts/table_query.py run-sql <<'PLAN'\n"
                " {\"queries\": [...]}\n"
                " PLAN")
    try:
        plan = json.loads(raw)
    except json.JSONDecodeError as e:
        return f"Error: failed to read SQL plan: {e}"
    # accept either {"queries": [...]} or a bare [...] list
    queries = plan.get("queries") if isinstance(plan, dict) else plan
    # A JSON string/number/object is truthy but is not a plan; only forward a
    # real, non-empty list to the backend.
    if not isinstance(queries, list) or not queries:
        return "Error: the plan must contain a non-empty `queries` list."
    payload = {"queries": queries}
    # Optional limits are only sent when explicitly set on the command line.
    if args.max_rows is not None:
        payload["max_rows"] = args.max_rows
    if args.cell_max is not None:
        payload["cell_max"] = args.cell_max
    res = _post("run_sql", bot_id, payload)
    if not res.get("success"):
        return (f"SQL execution failed: {res.get('error')}\n"
                "Fix your SQL and call run-sql again. Do NOT restart from search-tables.")
    parts = [TABLE_CITATION_INSTRUCTIONS]
    if res.get("instruction"):
        parts.append(res["instruction"])
    if res.get("knowledge"):
        parts.append(res["knowledge"])
    if res.get("extra_goal"):
        parts.append(res["extra_goal"])
    return "\n".join(parts)
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments, resolve the bot id, run the command.

    The selected sub-command's textual result is printed to stdout. The process
    exits with status 1 when no bot id can be resolved or when the sub-command
    raises an exception (its message is printed as "Error: ...").
    """
    parser = argparse.ArgumentParser(description="table-query: fast LLM-free table querying")
    parser.add_argument("--bot-id", default=None, help="Bot id (defaults to robot_config.json)")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # search-tables
    search_parser = subparsers.add_parser("search-tables", help="Vector-locate relevant tables")
    search_parser.add_argument("--query", "-q", required=True, help="Rewritten retrieval query")
    search_parser.add_argument("--top-k", "-k", type=int, default=20)

    # get-schemas
    schemas_parser = subparsers.add_parser("get-schemas",
                                           help="Fetch CREATE TABLE schema + sample rows")
    schemas_parser.add_argument("--tables", "-t", required=True,
                                help="Comma-separated table names")
    schemas_parser.add_argument("--sample-rows", type=int, default=3)

    # run-sql
    run_parser = subparsers.add_parser(
        "run-sql", help="Execute an authored SQL plan (JSON via stdin or file)")
    run_parser.add_argument("--plan-file", "-f", default=None,
                            help="Path to plan JSON file (optional; defaults to reading stdin)")
    run_parser.add_argument("--max-rows", type=int, default=None,
                            help="Max total result rows (raise if a result came back truncated)")
    run_parser.add_argument("--cell-max", type=int, default=None,
                            help="Max characters per cell before truncation")

    args = parser.parse_args()

    bot_id = _resolve_bot_id(args.bot_id)
    if not bot_id:
        print("Error: bot_id is required (robot_config.json / --bot-id / BOT_ID env)")
        sys.exit(1)

    # Map each sub-command name to the function that implements it.
    handlers = {
        "search-tables": cmd_search_tables,
        "get-schemas": cmd_get_schemas,
        "run-sql": cmd_run_sql,
    }
    try:
        handler = handlers.get(args.command)
        if handler is not None:
            print(handler(args, bot_id))
    except Exception as e:
        print(f"Error: {str(e)}")
        sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|