add promptfoo

This commit is contained in:
朱潮 2025-12-09 20:17:21 +08:00
parent ee41279569
commit 092f7de0c3
11 changed files with 70 additions and 83 deletions

View File

@ -1,9 +0,0 @@
question,regex,llm-rubric,,,,,,,,,,,,,,,,,
清水太郎在哪里,\[TOOL_CALL\].*find_employee_location,,,,,,,,,,,,,,,,,,
通知他明天上午8点开会,,询问是否确认发送,,,,,,,,,,,,,,,,,
确认,\[TOOL_CALL\].*wowtalk_send_message_to_member,,,,,,,,,,,,,,,,,,
把DefineRoom 4的灯光状态发给他,,调用find_devices_by_room和dxcore_get_device_status获取灯光状态并询问是否确认发送。,,,,,,,,,,,,,,,,,
确认,\[TOOL_CALL\].*wowtalk_send_message_to_member,,,,,,,,,,,,,,,,,,
900142の稼働状況,\[TOOL_CALL\].*dxcore_get_device_status,,,,,,,,,,,,,,,,,,
关闭设备900142的灯光,,询问是否确认关闭,,,,,,,,,,,,,,,,,
确认,\[TOOL_CALL\].*dxcore_update_device_status,,,,,,,,,,,,,,,,,,
1 question regex llm-rubric
2 清水太郎在哪里 \[TOOL_CALL\].*find_employee_location
3 通知他明天上午8点开会 询问是否确认发送
4 确认 \[TOOL_CALL\].*wowtalk_send_message_to_member
5 把DefineRoom 4的灯光状态发给他 调用find_devices_by_room和dxcore_get_device_status获取灯光状态,并询问是否确认发送。
6 确认 \[TOOL_CALL\].*wowtalk_send_message_to_member
7 900142の稼働状況 \[TOOL_CALL\].*dxcore_get_device_status
8 关闭设备900142的灯光 询问是否确认关闭
9 确认 \[TOOL_CALL\].*dxcore_update_device_status

View File

@ -15,7 +15,8 @@ def csv_to_yaml(csv_file, yaml_file):
if row['question']:
test_case = {
'vars': {
'question': row['question'].strip()
'question': row['question'].strip(),
'use_history': True if row['use_history'] == "1" else False,
},
'assert':[]
}
@ -41,5 +42,4 @@ def csv_to_yaml(csv_file, yaml_file):
print(f"Converted {len(tests)} test cases from {csv_file} to {yaml_file}")
if __name__ == '__main__':
csv_to_yaml("conversation/tests.csv", "conversation/tests.yaml")
csv_to_yaml("query/tests.csv", "query/tests.yaml")
csv_to_yaml("novare/novare.csv", "novare/tests.yaml")

View File

@ -1,5 +1,5 @@
env: {}
description: Novare Test
description: Novare Test - Unified Config for Both Single and Conversation
providers:
- id: openai:chat:qwen3
config:
@ -25,4 +25,4 @@ defaultTest:
apiVersion: '2024-02-01'
evaluateOptions: {}
writeLatestResults: true
sharing: true
sharing: true

View File

@ -0,0 +1,14 @@
question,use_history,regex,llm-rubric,,,,,,,,,,,,,,,,,
清水太郎在哪里,1,\[TOOL_CALL\].*find_employee_location,,,,,,,,,,,,,,,,,,
通知他明天上午8点开会,1,,询问是否确认发送,,,,,,,,,,,,,,,,,
确认,1,\[TOOL_CALL\].*wowtalk_send_message_to_member,,,,,,,,,,,,,,,,,,
把DefineRoom 4的灯光状态发给他,1,,调用find_devices_by_room和dxcore_get_device_status获取灯光状态并询问是否确认发送。,,,,,,,,,,,,,,,,,
确认,1,\[TOOL_CALL\].*wowtalk_send_message_to_member,,,,,,,,,,,,,,,,,,
关闭设备900142的灯光,1,,询问是否确认关闭,,,,,,,,,,,,,,,,,
确认,1,\[TOOL_CALL\].*dxcore_update_device_status,,,,,,,,,,,,,,,,,,
Define Room1 的灯光状态,0,\[TOOL_CALL\].*find_devices_by_room,,,,,,,,,,,,,,,,,,
900142の稼働状況,0,\[TOOL_CALL\].*dxcore_get_device_status,,,,,,,,,,,,,,,,,,
卫生间在哪里,0,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,
我丢了物品怎么办,0,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,
咖啡多少钱一杯,0,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,
东京明天的天气,0,\[TOOL_CALL\].*weather_get_by_location,,,,,,,,,,,,,,,,,,
1 question use_history regex llm-rubric
2 清水太郎在哪里 1 \[TOOL_CALL\].*find_employee_location
3 通知他明天上午8点开会 1 询问是否确认发送
4 确认 1 \[TOOL_CALL\].*wowtalk_send_message_to_member
5 把DefineRoom 4的灯光状态发给他 1 调用find_devices_by_room和dxcore_get_device_status获取灯光状态,并询问是否确认发送。
6 确认 1 \[TOOL_CALL\].*wowtalk_send_message_to_member
7 关闭设备900142的灯光 1 询问是否确认关闭
8 确认 1 \[TOOL_CALL\].*dxcore_update_device_status
9 Define Room1 的灯光状态 0 \[TOOL_CALL\].*find_devices_by_room
10 900142の稼働状況 0 \[TOOL_CALL\].*dxcore_get_device_status
11 卫生间在哪里 0 \[TOOL_CALL\].*rag_retrieve
12 我丢了物品怎么办 0 \[TOOL_CALL\].*rag_retrieve
13 咖啡多少钱一杯 0 \[TOOL_CALL\].*rag_retrieve
14 东京明天的天气 0 \[TOOL_CALL\].*weather_get_by_location

View File

@ -1,4 +1,5 @@
[
{% if use_history %}
{% for completion in _conversation %}
{
"role": "user",
@ -13,4 +14,10 @@
"role": "user",
"content": "{{ question | encode }}"
}
]
{% else %}
{
"role": "user",
"content": "{{ question }}"
}
{% endif %}
]

View File

@ -1,40 +1,78 @@
- vars:
question: 清水太郎在哪里
use_history: true
assert:
- type: regex
value: \[TOOL_CALL\].*find_employee_location
- vars:
question: 通知他明天上午8点开会
use_history: true
assert:
- type: llm-rubric
value: 询问是否确认发送
- vars:
question: 确认
use_history: true
assert:
- type: regex
value: \[TOOL_CALL\].*wowtalk_send_message_to_member
- vars:
question: 把DefineRoom 4的灯光状态发给他
use_history: true
assert:
- type: llm-rubric
value: 调用find_devices_by_room和dxcore_get_device_status获取灯光状态并询问是否确认发送。
- vars:
question: 确认
use_history: true
assert:
- type: regex
value: \[TOOL_CALL\].*wowtalk_send_message_to_member
- vars:
question: 900142の稼働状況
assert:
- type: regex
value: \[TOOL_CALL\].*dxcore_get_device_status
- vars:
question: 关闭设备900142的灯光
use_history: true
assert:
- type: llm-rubric
value: 询问是否确认关闭
- vars:
question: 确认
use_history: true
assert:
- type: regex
value: \[TOOL_CALL\].*dxcore_update_device_status
- vars:
question: Define Room1 的灯光状态
use_history: false
assert:
- type: regex
value: \[TOOL_CALL\].*find_devices_by_room
- vars:
question: 900142の稼働状況
use_history: false
assert:
- type: regex
value: \[TOOL_CALL\].*dxcore_get_device_status
- vars:
question: 卫生间在哪里
use_history: false
assert:
- type: regex
value: \[TOOL_CALL\].*rag_retrieve
- vars:
question: 我丢了物品怎么办
use_history: false
assert:
- type: regex
value: \[TOOL_CALL\].*rag_retrieve
- vars:
question: 咖啡多少钱一杯
use_history: false
assert:
- type: regex
value: \[TOOL_CALL\].*rag_retrieve
- vars:
question: 东京明天的天气
use_history: false
assert:
- type: regex
value: \[TOOL_CALL\].*weather_get_by_location

View File

@ -1,26 +0,0 @@
env: {}
description: Novare Test
providers:
- id: openai:chat:qwen3
config:
apiBaseUrl: https://catalog-agent-dev.gbase.ai/api/v2
apiKey: a21c99620a8ef61d69563afe05ccce89
passthrough:
bot_id: 63069654-7750-409d-9a58-a0960d899a20
tool_response: true
language: zh
prompts:
- file://prompt.json
tests: file://tests.yaml
defaultTest:
options:
provider:
text:
id: openai:chat:qwen/qwen3-next-80b-a3b-instruct
config:
apiKey: sk-hsKClH0Z695EkK5fDdB2Ec2fE13f4fC1B627BdBb8e554b5b-26
apiBaseUrl: https://one.felo.me/v1
apiVersion: '2024-02-01'
evaluateOptions: {}
writeLatestResults: true
sharing: true

View File

@ -1,6 +0,0 @@
[
{
"role": "user",
"content": "{{ question }}"
}
]

View File

@ -1,6 +0,0 @@
question,regex,llm-rubric,,,,,,,,,,,,,,,,,
Define Room1 的灯光状态,\[TOOL_CALL\].*find_devices_by_room,,,,,,,,,,,,,,,,,,
卫生间在哪里,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,
我丢了物品怎么办,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,
咖啡多少钱一杯,\[TOOL_CALL\].*rag_retrieve,,,,,,,,,,,,,,,,,,
东京明天的天气,\[TOOL_CALL\].*weather_get_by_location,,,,,,,,,,,,,,,,,,
1 question regex llm-rubric
2 Define Room1 的灯光状态 \[TOOL_CALL\].*find_devices_by_room
3 卫生间在哪里 \[TOOL_CALL\].*rag_retrieve
4 我丢了物品怎么办 \[TOOL_CALL\].*rag_retrieve
5 咖啡多少钱一杯 \[TOOL_CALL\].*rag_retrieve
6 东京明天的天气 \[TOOL_CALL\].*weather_get_by_location

View File

@ -1,25 +0,0 @@
- vars:
question: Define Room1 的灯光状态
assert:
- type: regex
value: \[TOOL_CALL\].*find_devices_by_room
- vars:
question: 卫生间在哪里
assert:
- type: regex
value: \[TOOL_CALL\].*rag_retrieve
- vars:
question: 我丢了物品怎么办
assert:
- type: regex
value: \[TOOL_CALL\].*rag_retrieve
- vars:
question: 咖啡多少钱一杯
assert:
- type: regex
value: \[TOOL_CALL\].*rag_retrieve
- vars:
question: 东京明天的天气
assert:
- type: regex
value: \[TOOL_CALL\].*weather_get_by_location