qwen_agent/skills/developing/table-query/scripts/table_query.py

#!/usr/bin/env python3
"""
table-query CLI.

Fast, LLM-free table querying. Talks to the felo-mygpt table_query endpoints:
  - search-tables : POST /v1/table_query/search_tables/{bot_id}
  - get-schemas   : POST /v1/table_query/get_schemas/{bot_id}
  - run-sql       : POST /v1/table_query/run_sql/{bot_id}

The agent drives the orchestration (rewrite -> locate -> author SQL -> run);
the backend only does cheap work, so each call returns in seconds.
"""

import argparse
import hashlib
import json
import os
import sys

try:
    import requests
except ImportError:
    print("Error: requests module is required. Please install it with: pip install requests")
    sys.exit(1)

# Backend base URL, read once from the BACKEND_HOST environment variable at
# import time; the literal below is the fallback when it is unset.
DEFAULT_BACKEND_HOST = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai")
# Master key (MASTERKEY env var, with a fallback); `_post` hashes it together
# with the bot id to derive the bearer token.
DEFAULT_MASTERKEY = os.getenv("MASTERKEY", "master")

# Citation rules that `cmd_run_sql` places at the top of every successful
# result. Same citation contract the legacy table_rag_retrieve used, so the
# agent's <CITATION ... /> behaviour is unchanged.
TABLE_CITATION_INSTRUCTIONS = """<CITATION_INSTRUCTIONS>
When using the retrieved table knowledge below, you MUST add XML citation tags for factual claims.

Format: `<CITATION file="file_id" filename="name.xlsx" sheet=1 rows=[2, 4] />`
- Parse `__src`: `F1S2R5` = file_ref F1, sheet 2, row 5
- Look up file_id in `file_ref_table`
- Combine same-sheet rows into one citation: `rows=[2, 4, 6]`
- MANDATORY: Create SEPARATE citation for EACH (file, sheet) combination
- NEVER put <CITATION> on the same line as a bullet point or table row
- Citations MUST be on separate lines AFTER the complete list/table
- NEVER include the `__src` column in your response - it is internal metadata only
- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge
- NEVER collect all citations and place them at the end of your response
</CITATION_INSTRUCTIONS>
"""


def load_config() -> dict:
    """Load robot_config.json from the robot project root (3 levels up from scripts/).

    Returns:
        The parsed configuration mapping. An empty dict is returned when the
        file does not exist, cannot be read or decoded, is not valid JSON, or
        its top-level JSON value is not an object. Apart from the "file does
        not exist" case, the problem is reported on stderr; nothing is raised.
    """
    config_path = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'robot_config.json')
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        # A missing config is the normal "not configured" case: stay silent.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Warning: failed to load robot_config.json: {e}", file=sys.stderr)
        return {}
    if not isinstance(config, dict):
        # Callers use `.get(...)` on the result, so a JSON list/string/number
        # must not leak out of here.
        print("Warning: failed to load robot_config.json: top-level value must be "
              f"a JSON object, got {type(config).__name__}", file=sys.stderr)
        return {}
    return config


def _resolve_bot_id(cli_bot_id: str) -> str:
    if cli_bot_id:
        return cli_bot_id
    return load_config().get('bot_id') or os.getenv("BOT_ID") or os.getenv("ASSISTANT_ID")


def _post(path: str, bot_id: str, payload: dict) -> dict:
    """POST `payload` as JSON to `/v1/table_query/{path}/{bot_id}` and return the decoded body.

    The bearer token is the MD5 hex digest of "<masterkey>:<bot_id>". When the
    TRACE_ID (or, failing that, X_REQUEST_ID) environment variable is set, it
    is forwarded as the `X-Request-ID` header.

    Raises:
        RuntimeError: if the response status code is not 200.
        Exceptions raised by `requests.post` or by decoding the response body
        are not caught here.
    """
    token = hashlib.md5(f"{DEFAULT_MASTERKEY}:{bot_id}".encode()).hexdigest()
    request_headers = {
        "content-type": "application/json",
        "authorization": f"Bearer {token}",
    }
    request_id = os.getenv("TRACE_ID") or os.getenv("X_REQUEST_ID")
    if request_id:
        request_headers["X-Request-ID"] = request_id
    response = requests.post(
        f"{DEFAULT_BACKEND_HOST}/v1/table_query/{path}/{bot_id}",
        json=payload,
        headers=request_headers,
        timeout=30,
    )
    if response.status_code == 200:
        return response.json()
    raise RuntimeError(f"API {path} returned {response.status_code}: {response.text}")


def cmd_search_tables(args, bot_id: str) -> str:
    """Run the `search-tables` command and render the candidate tables as text.

    Reads `args.query` and `args.top_k`, posts them to the `search_tables`
    endpoint, and returns either a bulleted list of the returned tables (name,
    file, sheet, rounded score, description) or a hint to fall back to
    document retrieval when no table matched.
    """
    response = _post("search_tables", bot_id, {"query": args.query, "top_k": args.top_k})
    candidates = response.get("tables", [])
    if candidates:
        # The header keeps its trailing newline so a blank line separates it
        # from the first entry once everything is joined.
        header = (f"Found {len(candidates)} candidate table(s). Pick the relevant ones and call "
                  f"`get-schemas` for them next.\n")
        entries = [
            f"- table_name: {hit['table_name']}\n"
            f"  file: {hit.get('file_name','')} | sheet: {hit.get('sheet_name','')} "
            f"| score: {round(hit.get('score', 0), 3)}\n"
            f"  description: {hit.get('table_description','')}"
            for hit in candidates
        ]
        return "\n".join([header] + entries)
    return ("No matching tables found. If the question may be answered from documents "
            "instead of spreadsheets, fall back to the rag_retrieve document tool.")


def cmd_get_schemas(args, bot_id: str) -> str:
    """Run the `get-schemas` command and render the schemas for SQL authoring.

    Args:
        args: Parsed CLI namespace; reads `tables` (comma-separated table
            names) and `sample_rows`.
        bot_id: Bot whose tables are queried.

    Returns:
        One markdown section per resolved table (CREATE TABLE statement plus
        optional CSV-formatted sample rows), a note about tables the backend
        reported as missing, and a usage hint for `run-sql`. When `tables`
        contains no usable name, or no schema could be resolved, a short
        message describing the problem is returned instead.
    """
    table_names = [name for name in (part.strip() for part in args.tables.split(',')) if name]
    if not table_names:
        # Input such as "" or " , " leaves nothing to look up; report it
        # immediately instead of spending a backend round trip on an empty list.
        return ("Error: no table names provided. Pass a comma-separated list of "
                "table names via --tables.")
    res = _post("get_schemas", bot_id,
                {"table_names": table_names, "sample_rows": args.sample_rows})
    schemas = res.get("schemas", [])
    missing = res.get("missing_tables", [])
    if not schemas:
        return f"No schemas resolved. Missing tables: {missing}"
    blocks = []
    for s in schemas:
        block = [f"### Table: {s['table_name']}",
                 f"File: {s.get('file_name','')} | Sheet: {s.get('sheet_name','')}",
                 "```sql", s.get('sql_create', ''), "```"]
        sample = s.get('sample_rows') or []
        if sample:
            block.append("Sample rows (format hint only, NOT the row count):")
            block.append("```csv")
            for row in sample:
                # Every cell is quoted and embedded quotes are doubled, so
                # commas or quotes inside a value cannot break the CSV line.
                block.append(",".join('"' + str(c).replace('"', '""') + '"' for c in row))
            block.append("```")
        blocks.append("\n".join(block))
    out = "\n\n".join(blocks)
    if missing:
        out += f"\n\nNote: these requested tables were not found: {missing}"
    out += ("\n\nNow author a SQLite plan and run it by piping the JSON to run-sql on stdin:\n"
            "  run-sql <<'PLAN'\n"
            "  {\"queries\": [{\"step\": 1, \"sql\": \"CREATE TEMP TABLE \\\"final_table_step1\\\" "
            "AS SELECT ...\", \"source_table_names\": [\"...\"], "
            "\"destine_table_name\": \"final_table_step1\", \"destine_table_type\": \"final\"}]}\n"
            "  PLAN\n"
            "Quote all identifiers with double quotes.")
    return out


def cmd_run_sql(args, bot_id: str) -> str:
    """Run the `run-sql` command: load a JSON SQL plan, execute it, render the result.

    The plan is read from `args.plan_file` when given, otherwise from stdin.
    It may be either `{"queries": [...]}` or a bare `[...]` list.

    Args:
        args: Parsed CLI namespace; reads `plan_file`, `max_rows` and
            `cell_max` (the latter two are only sent when not None).
        bot_id: Bot whose tables the plan runs against.

    Returns:
        On success, the citation instructions followed by whichever of the
        backend's `instruction`, `knowledge` and `extra_goal` fields are
        non-empty, joined by newlines. On any input or execution problem, a
        human-readable message explaining what to fix (no exception is raised
        for a bad plan).
    """
    # Read the plan from --plan-file if given, otherwise from stdin (heredoc).
    try:
        if args.plan_file:
            with open(args.plan_file, 'r', encoding='utf-8') as f:
                raw = f.read()
        else:
            raw = sys.stdin.read()
        if not raw.strip():
            return ("Error: no plan provided. Pipe the JSON plan via stdin, e.g.\n"
                    "  python scripts/table_query.py run-sql <<'PLAN'\n"
                    "  {\"queries\": [...]}\n"
                    "  PLAN")
        plan = json.loads(raw)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError as well as UnicodeDecodeError
        # from a plan file that is not valid UTF-8.
        return f"Error: failed to read SQL plan: {e}"
    # accept either {"queries": [...]} or a bare [...] list
    queries = plan.get("queries") if isinstance(plan, dict) else plan
    # Valid JSON is not necessarily a valid plan: a bare string or number, or
    # a non-list `queries` value, must be rejected here rather than posted.
    if not isinstance(queries, list) or not queries:
        return "Error: the plan must contain a non-empty `queries` list."
    payload = {"queries": queries}
    if args.max_rows is not None:
        payload["max_rows"] = args.max_rows
    if args.cell_max is not None:
        payload["cell_max"] = args.cell_max
    res = _post("run_sql", bot_id, payload)
    if not res.get("success"):
        return (f"SQL execution failed: {res.get('error')}\n"
                "Fix your SQL and call run-sql again. Do NOT restart from search-tables.")
    parts = [TABLE_CITATION_INSTRUCTIONS]
    parts.extend(res[key] for key in ("instruction", "knowledge", "extra_goal") if res.get(key))
    return "\n".join(parts)


def main():
    """CLI entry point: parse arguments, resolve the bot id, run the chosen sub-command.

    The sub-command's text result is printed to stdout. The process exits with
    status 1 when no bot id can be resolved or when the sub-command raises.
    """
    parser = argparse.ArgumentParser(description="table-query: fast LLM-free table querying")
    parser.add_argument("--bot-id", default=None, help="Bot id (defaults to robot_config.json)")
    subcommands = parser.add_subparsers(dest="command", required=True)

    search_parser = subcommands.add_parser("search-tables", help="Vector-locate relevant tables")
    search_parser.add_argument("--query", "-q", required=True, help="Rewritten retrieval query")
    search_parser.add_argument("--top-k", "-k", type=int, default=20)

    schemas_parser = subcommands.add_parser(
        "get-schemas", help="Fetch CREATE TABLE schema + sample rows")
    schemas_parser.add_argument(
        "--tables", "-t", required=True, help="Comma-separated table names")
    schemas_parser.add_argument("--sample-rows", type=int, default=3)

    run_parser = subcommands.add_parser(
        "run-sql", help="Execute an authored SQL plan (JSON via stdin or file)")
    run_parser.add_argument(
        "--plan-file", "-f", default=None,
        help="Path to plan JSON file (optional; defaults to reading stdin)")
    run_parser.add_argument(
        "--max-rows", type=int, default=None,
        help="Max total result rows (raise if a result came back truncated)")
    run_parser.add_argument(
        "--cell-max", type=int, default=None,
        help="Max characters per cell before truncation")

    args = parser.parse_args()
    bot_id = _resolve_bot_id(args.bot_id)
    if not bot_id:
        print("Error: bot_id is required (robot_config.json / --bot-id / BOT_ID env)")
        sys.exit(1)

    # Map each sub-command name to the function implementing it.
    handlers = {
        "search-tables": cmd_search_tables,
        "get-schemas": cmd_get_schemas,
        "run-sql": cmd_run_sql,
    }
    handler = handlers.get(args.command)
    try:
        if handler is not None:
            print(handler(args, bot_id))
    except Exception as e:
        # Top-level boundary: turn any failure into a one-line message and a
        # non-zero exit status instead of a traceback.
        print(f"Error: {str(e)}")
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()