diff --git a/mcp/rag_retrieve_server.py b/mcp/rag_retrieve_server.py index e44d50d..80a659f 100644 --- a/mcp/rag_retrieve_server.py +++ b/mcp/rag_retrieve_server.py @@ -29,6 +29,49 @@ from mcp_common import ( BACKEND_HOST = os.getenv("BACKEND_HOST", "https://api-dev.gptbase.ai") MASTERKEY = os.getenv("MASTERKEY", "master") +# Citation instruction prefixes injected into tool results +DOCUMENT_CITATION_INSTRUCTIONS = """ +When using the retrieved knowledge below, you MUST add XML citation tags for factual claims. + +## Document Knowledge +Format: `<CITATION file="file_uuid" filename="name.pdf" page=3 />` +- Use `file` attribute with the UUID from document markers +- Use `filename` attribute with the actual filename from document markers +- Use `page` attribute (singular) with the page number +- `page` MUST be 0-based and must match the `pages:` values shown in the learned knowledge context + +## Web Page Knowledge +Format: `<CITATION url="https://example.com/page" />` +- Use `url` attribute with the web page URL from the source metadata +- Do not use `file`, `filename`, or `page` attributes for web sources +- If content is grounded in a web source, prefer a web citation with `url` over a file citation + +## Placement Rules +- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge +- NEVER collect all citations and place them at the end of your response +- Limit to 1-2 citations per paragraph/bullet list +- If your answer uses learned knowledge, you MUST generate at least 1 `<CITATION ... />` in the response + + +""" + +TABLE_CITATION_INSTRUCTIONS = """ +When using the retrieved table knowledge below, you MUST add XML citation tags for factual claims. 
+ +Format: `<CITATION file="file_id" sheet=2 rows=[2, 4, 6] />` +- Parse `__src`: `F1S2R5` = file_ref F1, sheet 2, row 5 +- Look up file_id in `file_ref_table` +- Combine same-sheet rows into one citation: `rows=[2, 4, 6]` +- MANDATORY: Create SEPARATE citation for EACH (file, sheet) combination +- NEVER put a `<CITATION ... />` tag on the same line as a bullet point or table row +- Citations MUST be on separate lines AFTER the complete list/table +- NEVER include the `__src` column in your response - it is internal metadata only +- Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge +- NEVER collect all citations and place them at the end of your response + + +""" + def rag_retrieve(query: str, top_k: int = 100) -> Dict[str, Any]: """调用RAG检索API""" try: @@ -94,7 +137,7 @@ def rag_retrieve(query: str, top_k: int = 100) -> Dict[str, Any]: "content": [ { "type": "text", - "text": markdown_content + "text": DOCUMENT_CITATION_INSTRUCTIONS + markdown_content } ] } @@ -107,7 +150,7 @@ def rag_retrieve(query: str, top_k: int = 100) -> Dict[str, Any]: } ] } - + except requests.exceptions.RequestException as e: return { "content": [ @@ -179,7 +222,7 @@ def table_rag_retrieve(query: str) -> Dict[str, Any]: "content": [ { "type": "text", - "text": markdown_content + "text": TABLE_CITATION_INSTRUCTIONS + markdown_content } ] } diff --git a/prompt/system_prompt.md b/prompt/system_prompt.md index 8162c7f..c978779 100644 --- a/prompt/system_prompt.md +++ b/prompt/system_prompt.md @@ -2,83 +2,13 @@ ## CITATION REQUIREMENTS -### A. Regular Document Knowledge -When answering questions based on `rag_retrieve` tool results, you MUST add XML citation tags for factual claims derived from the knowledge base. +When your answer uses learned knowledge, you MUST generate `<CITATION ... />` tags. Follow the specific citation format instructions returned by each tool (`rag_retrieve`, `table_rag_retrieve`). 
-**Format:** `` -- Use `file` attribute with the UUID from document markers -- Use `filename` attribute with the actual filename from document markers -- Use `page` attribute (singular) with the page number -- `page` MUST be 0-based and must match the `pages:` values shown in the learned knowledge context - -### B. Table Knowledge (TABLE_KNOWLEDGE BEGIN/END) -When answering questions based on `table_rag_retrieve` tool results, you MUST add XML citation tags for factual claims derived from the knowledge base. - -**!!! CRITICAL RULE: NEVER put on same line as bullet/row !!!** -**Citations MUST be on separate lines AFTER the complete list/table.** -**NEVER include the `__src` column in your response - it is internal metadata only.** - -Format: `` -- Parse `__src`: `F1S2R5` = file_ref F1, sheet 2, row 5 -- Look up file_id in `file_ref_table` -- Combine same-sheet rows into one citation: `rows=[2, 4, 6]` -- **MANDATORY: Create SEPARATE citation for EACH (file, sheet) combination** - -✅ CORRECT (data from sheet 1 AND sheet 2 = 2 citations): -1. Liam - male -2. Noah - male -3. Ethan - male -4. Mason - male -5. William - male - - - -❌ WRONG (citation on same line): -1. Liam - male -❌ WRONG (missing sheet 2 citation): -...only 1 citation when data comes from 2 sheets... - - -### C. Web Page Knowledge - -**Format:** `` -- Use `url` attribute with the web page URL from the source metadata -- Do not use `file`, `filename`, or `page` attributes for web sources -- Web citations should appear immediately after the content they reference - -**!!! CRITICAL PLACEMENT RULES !!!** -1. **Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list** that uses the knowledge -2. **NEVER collect all citations and place them at the end of your response** -3. **Limit to 1-2 citations per paragraph/bullet list** - combine related facts under one citation -4. **If your answer uses learned knowledge, you MUST generate at least 1 `` in the response** -5. 
**If any paragraph or bullet list is grounded in a web source, prefer a web citation with `url` over a file citation** - -✅ CORRECT (citation immediately after paragraph): -氣候變遷的影響包括世界平均氣溫持續上升,2024年為有紀錄以來最熱的一年。 - -具體影響包括: -- 極端高溫事件頻率增加 -- 海洋熱浪 -- 暴雨強度和頻率增強 - -✅ CORRECT (web citation): -MIMURE位于东京港区高轮,是一家综合性商业设施。 - -❌ WRONG (all citations at the end): -氣候變遷的影響包括...(long response)... - - - - -(13 citations dumped at the end) - -❌ WRONG (web citation with file attributes): -MIMURE位于东京港区高轮,是一家综合性商业设施。 - -❌ WRONG (too many citations for short content): -2024年全球氣溫上升。 -世界各地發生災害。 -沙烏地阿拉伯熱浪。 +### General Placement Rules +1. Citations MUST appear IMMEDIATELY AFTER the paragraph or bullet list that uses the knowledge +2. NEVER collect all citations and place them at the end of your response +3. Limit to 1-2 citations per paragraph/bullet list - combine related facts under one citation +4. If your answer uses learned knowledge, you MUST generate at least 1 `<CITATION ... />` in the response ### Current Working Directory