diff --git a/mcp/rag_retrieve_server.py b/mcp/rag_retrieve_server.py index e44d50d..80a659f 100644 --- a/mcp/rag_retrieve_server.py +++ b/mcp/rag_retrieve_server.py @@ -29,6 +29,49 @@ from mcp_common import ( BACKEND_HOST = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai") MASTERKEY = os.getenv("MASTERKEY", "master") +# Citation instruction prefixes injected into tool results +DOCUMENT_CITATION_INSTRUCTIONS = """ +When using the retrieved knowledge below, you MUST add XML citation tags for factual claims. + +## Document Knowledge +Format: `` +- Use `file` attribute with the UUID from document markers +- Use `filename` attribute with the actual filename from document markers +- Use `page` attribute (singular) with the page number +- `page` MUST be 0-based and must match the `pages:` values shown in the learned knowledge context + +## Web Page Knowledge +Format: `` +- Use `url` attribute with the web page URL from the source metadata +- Do not use `file`, `filename`, or `page` attributes for web sources +- If content is grounded in a web source, prefer a web citation with `url` over a file citation + +## Placement Rules +- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge +- NEVER collect all citations and place them at the end of your response +- Limit to 1-2 citations per paragraph/bullet list +- If your answer uses learned knowledge, you MUST generate at least 1 `` in the response + + +""" + +TABLE_CITATION_INSTRUCTIONS = """ +When using the retrieved table knowledge below, you MUST add XML citation tags for factual claims. 
+ +Format: `` +- Parse `__src`: `F1S2R5` = file_ref F1, sheet 2, row 5 +- Look up file_id in `file_ref_table` +- Combine same-sheet rows into one citation: `rows=[2, 4, 6]` +- MANDATORY: Create SEPARATE citation for EACH (file, sheet) combination +- NEVER put on the same line as a bullet point or table row +- Citations MUST be on separate lines AFTER the complete list/table +- NEVER include the `__src` column in your response - it is internal metadata only +- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge +- NEVER collect all citations and place them at the end of your response + + +""" + def rag_retrieve(query: str, top_k: int = 100) -> Dict[str, Any]: """调用RAG检索API""" try: @@ -94,7 +137,7 @@ def rag_retrieve(query: str, top_k: int = 100) -> Dict[str, Any]: "content": [ { "type": "text", - "text": markdown_content + "text": DOCUMENT_CITATION_INSTRUCTIONS + markdown_content } ] } @@ -107,7 +150,7 @@ def rag_retrieve(query: str, top_k: int = 100) -> Dict[str, Any]: } ] } - + except requests.exceptions.RequestException as e: return { "content": [ @@ -179,7 +222,7 @@ def table_rag_retrieve(query: str) -> Dict[str, Any]: "content": [ { "type": "text", - "text": markdown_content + "text": TABLE_CITATION_INSTRUCTIONS + markdown_content } ] } diff --git a/prompt/system_prompt.md b/prompt/system_prompt.md index 32439de..c1d4812 100644 --- a/prompt/system_prompt.md +++ b/prompt/system_prompt.md @@ -1,5 +1,16 @@ {extra_prompt} +## CITATION REQUIREMENTS + +When your answer uses learned knowledge, you MUST generate XML citation tags. Follow the specific citation format instructions returned by each tool (`rag_retrieve`, `table_rag_retrieve`). + +### General Placement Rules +1. Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge +2. NEVER collect all citations and place them at the end of your response +3. Limit to 1-2 citations per paragraph/bullet list - combine related facts under one citation +4. 
If your answer uses learned knowledge, you MUST generate at least 1 XML citation tag in the response + + ### Current Working Directory PROJECT_ROOT: `{agent_dir_path}`