# qwen_agent/skills/kfs-answer/scripts/merge_citations.py

"""Merge accumulated citation data into final CITATION tags.

Usage: python3 merge_citations.py

Reads {session_dir}/citations.jsonl (appended by query.py / query_db.py),
merges rows by (file, sheet), outputs one <CITATION .../> tag per combination.

Agent calls this ONCE before composing the final answer (Step 4).
"""
import json
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _session import get_session_dir


def _sort_key(value):
    """Return a sort key that is safe for mixed int/str values.

    Plain ``sorted()`` raises ``TypeError`` when numbers and strings are
    mixed. Numbers sort first (numerically), everything else sorts after
    them by its string form, so homogeneous inputs keep their natural order.
    """
    if isinstance(value, (int, float)):
        return (0, value)
    return (1, str(value))


def _load_groups(citations_path):
    """Read the JSONL file and merge entries by ``(file, sheet)``.

    Returns a dict mapping ``(file, sheet)`` to
    ``{"filename": str, "rows": set}``. Lines that are not valid JSON, or
    that lack the ``file`` / ``sheet`` keys, are skipped with a warning on
    stderr so a single bad line does not discard every other citation.
    """
    groups = {}
    with open(citations_path, "r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                key = (entry["file"], entry["sheet"])
                group = groups.setdefault(key, {"filename": "", "rows": set()})
            except (json.JSONDecodeError, KeyError, TypeError) as exc:
                print(
                    f"[WARN] merge_citations: skipping malformed line {line_no}: {exc!r}",
                    file=sys.stderr,
                )
                continue
            # Keep the first non-empty filename seen for this group; earlier
            # entries may have been written without one.
            if not group["filename"]:
                group["filename"] = entry.get("filename") or ""
            # "rows" may be absent or null for sheet-level citations.
            group["rows"].update(entry.get("rows") or [])
    return groups


def _format_tag(file_id, sheet_num, info):
    """Build one ``<CITATION .../>`` tag for a merged ``(file, sheet)`` group."""
    # NOTE(review): attribute values are emitted verbatim (no escaping), as
    # before; confirm the tag consumer tolerates quotes in filenames.
    filename = info["filename"]
    fn_attr = f' filename="{filename}"' if filename else ""
    rows = sorted(info["rows"], key=_sort_key)
    if not rows:
        # Sheet-level citation (markdown, no rows)
        return f'<CITATION file="{file_id}"{fn_attr} sheet="{sheet_num}" />'
    rows_str = "[" + ", ".join(str(r) for r in rows) + "]"
    return f'<CITATION file="{file_id}"{fn_attr} sheet="{sheet_num}" rows="{rows_str}" />'


def main(citations_path=None):
    """Print merged CITATION tags for the accumulated citation entries.

    Args:
        citations_path: Path of the JSONL file to merge. When ``None``
            (the default), ``citations.jsonl`` inside the directory returned
            by ``get_session_dir()`` is used.

    Prints ``[NO CITATIONS]`` when the file is missing or holds no usable
    entries; otherwise prints ``[CITATIONS]`` followed by one tag per
    ``(file, sheet)`` combination, in sorted order.
    """
    if citations_path is None:
        citations_path = os.path.join(get_session_dir(), "citations.jsonl")
    if not os.path.isfile(citations_path):
        print("[NO CITATIONS]")
        return

    groups = _load_groups(citations_path)
    if not groups:
        print("[NO CITATIONS]")
        return

    print("[CITATIONS]")
    ordered = sorted(groups, key=lambda k: (_sort_key(k[0]), _sort_key(k[1])))
    for file_id, sheet_num in ordered:
        print(_format_tag(file_id, sheet_num, groups[(file_id, sheet_num)]))

# Script entry point: run the merge only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()