remove guideline

This commit is contained in:
朱潮 2025-12-08 19:13:36 +08:00
parent 7fbebef764
commit 55651c38d0
7 changed files with 26 additions and 2470 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,9 @@
question,regex,llm-rubric,,,,,,,,,,,,,,,,,
田中花子在哪里,\[TOOL_CALL\].*find_employee_location,,,,,,,,,,,,,,,,,,
通知她明天上午8点开会,,询问是否确认发送,,,,,,,,,,,,,,,,,
清水太郎在哪里,\[TOOL_CALL\].*find_employee_location,,,,,,,,,,,,,,,,,,
通知他明天上午8点开会,,询问是否确认发送,,,,,,,,,,,,,,,,,
确认,\[TOOL_CALL\].*wowtalk_send_message_to_member,,,,,,,,,,,,,,,,,,
把DefineRoom 4的灯光状态发给田中花子,,调用find_devices_by_room和dxcore_get_device_status获取灯光状态,并询问是否确认发送。,,,,,,,,,,,,,,,,,
把DefineRoom 4的灯光状态发给他,,调用find_devices_by_room和dxcore_get_device_status获取灯光状态,并询问是否确认发送。,,,,,,,,,,,,,,,,,
确认,\[TOOL_CALL\].*wowtalk_send_message_to_member,,,,,,,,,,,,,,,,,,
900142の稼働状況,\[TOOL_CALL\].*dxcore_get_device_status,,,,,,,,,,,,,,,,,,
关闭设备900142的灯光,,询问是否确认关闭,,,,,,,,,,,,,,,,,
确认,\[TOOL_CALL\].*dxcore_update_device_status,,,,,,,,,,,,,,,,,,
1 question regex llm-rubric
2 田中花子在哪里 清水太郎在哪里 \[TOOL_CALL\].*find_employee_location
3 通知她明天上午8点开会 通知他明天上午8点开会 询问是否确认发送
4 确认 \[TOOL_CALL\].*wowtalk_send_message_to_member
5 把DefineRoom 4的灯光状态发给田中花子 把DefineRoom 4的灯光状态发给他 调用find_devices_by_room和dxcore_get_device_status获取灯光状态,并询问是否确认发送。
6 确认 \[TOOL_CALL\].*wowtalk_send_message_to_member
7 900142の稼働状況 \[TOOL_CALL\].*dxcore_get_device_status
8 关闭设备900142的灯光 询问是否确认关闭
9 确认 \[TOOL_CALL\].*dxcore_update_device_status

View File

@ -1,10 +1,10 @@
- vars:
question: 田中花子在哪里
question: 清水太郎在哪里
assert:
- type: regex
value: \[TOOL_CALL\].*find_employee_location
- vars:
question: 通知她明天上午8点开会
question: 通知他明天上午8点开会
assert:
- type: llm-rubric
value: 询问是否确认发送
@ -14,7 +14,7 @@
- type: regex
value: \[TOOL_CALL\].*wowtalk_send_message_to_member
- vars:
question: 把DefineRoom 4的灯光状态发给田中花子
question: 把DefineRoom 4的灯光状态发给他
assert:
- type: llm-rubric
value: 调用find_devices_by_room和dxcore_get_device_status获取灯光状态,并询问是否确认发送。
@ -23,3 +23,18 @@
assert:
- type: regex
value: \[TOOL_CALL\].*wowtalk_send_message_to_member
- vars:
question: 900142の稼働状況
assert:
- type: regex
value: \[TOOL_CALL\].*dxcore_get_device_status
- vars:
question: 关闭设备900142的灯光
assert:
- type: llm-rubric
value: 询问是否确认关闭
- vars:
question: 确认
assert:
- type: regex
value: \[TOOL_CALL\].*dxcore_update_device_status

View File

@ -1,137 +0,0 @@
{
"evalId": "eval-kAj-2025-12-04T14:31:43",
"results": {
"version": 3,
"timestamp": "2025-12-04T14:31:43.709Z",
"prompts": [
{
"raw": "[{\"role\":\"user\",\"content\":\"{{ question }}\"}]",
"label": "prompt.json: [{\"role\":\"user\",\"content\":\"{{ question }}\"}]",
"id": "7fd9d6ab1656b5f683dd7d34fc535754cd42291c7b78f2aa5fd68b3e43dae7b6",
"provider": "openai:chat:qwen3",
"metrics": {
"score": 0,
"testPassCount": 0,
"testFailCount": 0,
"testErrorCount": 0,
"assertPassCount": 0,
"assertFailCount": 0,
"totalLatencyMs": 0,
"tokenUsage": {
"prompt": 0,
"completion": 0,
"cached": 0,
"total": 0,
"numRequests": 0,
"completionDetails": {
"reasoning": 0,
"acceptedPrediction": 0,
"rejectedPrediction": 0
},
"assertions": {
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 0,
"numRequests": 0,
"completionDetails": {
"reasoning": 0,
"acceptedPrediction": 0,
"rejectedPrediction": 0
}
}
},
"namedScores": {},
"namedScoresCount": {},
"cost": 0
}
}
],
"results": [],
"stats": {
"successes": 0,
"failures": 0,
"errors": 0,
"tokenUsage": {
"prompt": 0,
"completion": 0,
"cached": 0,
"total": 0,
"numRequests": 0,
"completionDetails": {
"reasoning": 0,
"acceptedPrediction": 0,
"rejectedPrediction": 0
},
"assertions": {
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 0,
"numRequests": 0,
"completionDetails": {
"reasoning": 0,
"acceptedPrediction": 0,
"rejectedPrediction": 0
}
}
}
}
},
"config": {
"tags": {},
"description": "Novare Test",
"prompts": [
"file:///Users/moshui/Documents/felo/qwen-agent/promptfoo/query/prompt.json"
],
"providers": [
{
"id": "openai:chat:qwen3",
"config": {
"apiBaseUrl": "https://catalog-agent-dev.gbase.ai/api/v2",
"apiKey": "a21c99620a8ef61d69563afe05ccce89",
"passthrough": {
"bot_id": "63069654-7750-409d-9a58-a0960d899a20",
"tool_response": true,
"language": "zh"
}
}
}
],
"tests": [],
"scenarios": [],
"env": {},
"sharing": true,
"defaultTest": {
"options": {
"provider": {
"text": {
"id": "openai:chat:qwen/qwen3-next-80b-a3b-instruct",
"config": {
"apiKey": "sk-hsKClH0Z695EkK5fDdB2Ec2fE13f4fC1B627BdBb8e554b5b-26",
"apiBaseUrl": "https://one.felo.me/v1",
"apiVersion": "2024-02-01"
}
}
}
},
"vars": {},
"assert": [],
"metadata": {}
},
"outputPath": [
"result.json"
],
"extensions": [],
"metadata": {}
},
"shareableUrl": null,
"metadata": {
"promptfooVersion": "0.117.11",
"nodeVersion": "v20.10.0",
"platform": "darwin",
"arch": "x64",
"exportedAt": "2025-12-04T14:31:43.766Z",
"evaluationCreatedAt": "2025-12-04T14:31:43.709Z"
}
}

View File

@ -1,6 +1,4 @@
question,regex,llm-rubric,,,,,,,,,,,,,,,,,
940092の稼働状況,\[TOOL_CALL\].*dxcore_get_device_status,,,,,,,,,,,,,,,,,,
关闭设备900142的灯光,\[TOOL_CALL\].*dxcore_update_device_status,,,,,,,,,,,,,,,,,,
Define Room1 的灯光状态,\[TOOL_CALL\].*find_devices_by_room,,,,,,,,,,,,,,,,,,
卫生间在哪里,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,
我丢了物品怎么办,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,

1 question regex llm-rubric
940092の稼働状況 \[TOOL_CALL\].*dxcore_get_device_status
关闭设备900142的灯光 \[TOOL_CALL\].*dxcore_update_device_status
2 Define Room1 的灯光状态 \[TOOL_CALL\].*find_devices_by_room
3 卫生间在哪里 \[TOOL_CALL\].*rag_retrieve
4 我丢了物品怎么办 \[TOOL_CALL\].*rag_retrieve

View File

@ -1,13 +1,3 @@
- vars:
question: 940092の稼働状況
assert:
- type: regex
value: \[TOOL_CALL\].*dxcore_get_device_status
- vars:
question: 关闭设备900142的灯光
assert:
- type: regex
value: \[TOOL_CALL\].*dxcore_update_device_status
- vars:
question: Define Room1 的灯光状态
assert:

View File

@ -659,6 +659,7 @@ def extract_block_from_system_prompt(system_prompt: str) -> tuple[str, str, str,
if block_type == 'guideline' or block_type == 'guidelines':
guidelines = content.strip()
blocks_to_remove.append(match.group(0))
elif block_type == 'tools':
tools = content.strip()
elif block_type == 'scenarios':