add regex grep
This commit is contained in:
parent 8cf7f956d6
commit 3591d8228e
@@ -19,8 +19,8 @@ RUN sed -i 's|http://deb.debian.org|http://mirrors.aliyun.com|g' /etc/apt/source
COPY requirements.txt .
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt

# Copy the application code
COPY . .
# Install modelscope
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ modelscope

# Create the required directories
RUN mkdir -p /app/projects
@@ -28,12 +28,12 @@ RUN mkdir -p /app/public
RUN mkdir -p /app/models
RUN mkdir -p /app/queue_data

# Install modelscope
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ modelscope

# Download the sentence-transformers model from ModelScope into the models directory
RUN python -c "from modelscope import snapshot_download; model_dir = snapshot_download('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); import shutil; shutil.move(model_dir, '/app/models/paraphrase-multilingual-MiniLM-L12-v2')"

# Copy the application code
COPY . .

# Expose the port
EXPOSE 8001

@@ -2,7 +2,9 @@ version: "3.8"

services:
  qwen-agent:
    build: .
    build:
      context: .
      dockerfile: Dockerfile.modelscope
    container_name: qwen-agent-api
    ports:
      - "8001:8001"

@@ -611,6 +611,369 @@ def find_file_in_project(filename: str, project_dir: str) -> Optional[str]:
    return None


def regex_grep(pattern: str, file_paths: List[str], context_lines: int = 0,
               case_sensitive: bool = False, limit: int = 50) -> Dict[str, Any]:
    """Search file contents with a regular expression, with optional context lines."""
    if not pattern:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: Pattern cannot be empty"
                }
            ]
        }

    if not file_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: File path list cannot be empty"
                }
            ]
        }

    # Compile the regular expression
    try:
        flags = 0 if case_sensitive else re.IGNORECASE
        compiled_pattern = re.compile(pattern, flags)
    except re.error as e:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Invalid regular expression '{pattern}': {str(e)}"
                }
            ]
        }

    # Restrict searches to the project directory
    project_data_dir = get_allowed_directory()

    # Validate the file paths
    valid_paths = []
    for file_path in file_paths:
        try:
            # Resolve relative paths
            if not os.path.isabs(file_path):
                # Strip a leading 'projects/' prefix if present
                clean_path = file_path
                if clean_path.startswith('projects/'):
                    clean_path = clean_path[9:]  # drop the 'projects/' prefix
                elif clean_path.startswith('./projects/'):
                    clean_path = clean_path[11:]  # drop the './projects/' prefix

                # Look for the file inside the project directory
                full_path = os.path.join(project_data_dir, clean_path.lstrip('./'))
                if os.path.exists(full_path):
                    valid_paths.append(full_path)
                else:
                    # Fall back to a recursive lookup if the direct path is missing
                    found = find_file_in_project(clean_path, project_data_dir)
                    if found:
                        valid_paths.append(found)
            else:
                if file_path.startswith(project_data_dir) and os.path.exists(file_path):
                    valid_paths.append(file_path)
        except Exception:
            continue

    if not valid_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
                }
            ]
        }

    # Collect all matches
    all_results = []

    for file_path in valid_paths:
        try:
            results = regex_search_in_file(file_path, compiled_pattern, context_lines, case_sensitive)
            all_results.extend(results)
        except Exception:
            continue

    # Sort by file path and line number
    all_results.sort(key=lambda x: (x['file_path'], x['match_line_number']))

    # Cap the number of results
    limited_results = all_results[:limit]

    if not limited_results:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "No matches found"
                }
            ]
        }

    # Format the output
    formatted_lines = []

    # Report the total match count
    total_matches = len(all_results)
    showing_count = len(limited_results)
    summary_line = f"Found {total_matches} matches, showing top {showing_count} results:"
    formatted_lines.append(summary_line)

    # Group results by file
    current_file = None
    for result in limited_results:
        file_path = result['file_path']
        if file_path != current_file:
            current_file = file_path
            file_name = os.path.basename(file_path)
            formatted_lines.append(f"\n--- File: {file_name} ---")

        match_line = result['match_line_number']
        match_text = result['match_text']
        matched_content = result['matched_content']

        # Context lines before the match
        if 'context_before' in result:
            for context_line in result['context_before']:
                formatted_lines.append(f"{context_line['line_number']}:{context_line['content']}")

        # The matching line itself
        formatted_lines.append(f"{match_line}:{matched_content}")

        # Context lines after the match
        if 'context_after' in result:
            for context_line in result['context_after']:
                formatted_lines.append(f"{context_line['line_number']}:{context_line['content']}")

    formatted_output = "\n".join(formatted_lines)

    return {
        "content": [
            {
                "type": "text",
                "text": formatted_output
            }
        ]
    }
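A minimal usage sketch (editorial, not part of the commit): the file path and pattern below are invented, and the call assumes the module's imports and get_allowed_directory() are configured as in the rest of the file.

result = regex_grep(r"def \w+\(", ["demo/app.py"], context_lines=2, limit=10)
# The tool returns an MCP-style payload; the report text lives at:
print(result["content"][0]["text"])
# e.g. "Found 4 matches, showing top 4 results:" followed by per-file groups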


def regex_grep_count(pattern: str, file_paths: List[str],
                     case_sensitive: bool = False) -> Dict[str, Any]:
    """Count regular-expression matches across files."""
    if not pattern:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: Pattern cannot be empty"
                }
            ]
        }

    if not file_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: File path list cannot be empty"
                }
            ]
        }

    # Compile the regular expression
    try:
        flags = 0 if case_sensitive else re.IGNORECASE
        compiled_pattern = re.compile(pattern, flags)
    except re.error as e:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Invalid regular expression '{pattern}': {str(e)}"
                }
            ]
        }

    # Restrict searches to the project directory
    project_data_dir = get_allowed_directory()

    # Validate the file paths
    valid_paths = []
    for file_path in file_paths:
        try:
            # Resolve relative paths
            if not os.path.isabs(file_path):
                # Strip a leading 'projects/' prefix if present
                clean_path = file_path
                if clean_path.startswith('projects/'):
                    clean_path = clean_path[9:]  # drop the 'projects/' prefix
                elif clean_path.startswith('./projects/'):
                    clean_path = clean_path[11:]  # drop the './projects/' prefix

                # Look for the file inside the project directory
                full_path = os.path.join(project_data_dir, clean_path.lstrip('./'))
                if os.path.exists(full_path):
                    valid_paths.append(full_path)
                else:
                    # Fall back to a recursive lookup if the direct path is missing
                    found = find_file_in_project(clean_path, project_data_dir)
                    if found:
                        valid_paths.append(found)
            else:
                if file_path.startswith(project_data_dir) and os.path.exists(file_path):
                    valid_paths.append(file_path)
        except Exception:
            continue

    if not valid_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
                }
            ]
        }

    # Tally the matches
    total_matches = 0
    total_lines_with_matches = 0
    file_stats = {}

    for file_path in valid_paths:
        try:
            matches, lines_with_matches = regex_count_in_file(file_path, compiled_pattern, case_sensitive)
            total_matches += matches
            total_lines_with_matches += lines_with_matches

            file_name = os.path.basename(file_path)
            file_stats[file_name] = {
                'matches': matches,
                'lines_with_matches': lines_with_matches
            }
        except Exception:
            continue

    # Format the output
    formatted_lines = []
    formatted_lines.append("=== Regex Match Statistics ===")
    formatted_lines.append(f"Pattern: {pattern}")
    formatted_lines.append(f"Files searched: {len(valid_paths)}")
    formatted_lines.append(f"Total matches: {total_matches}")
    formatted_lines.append(f"Total lines with matches: {total_lines_with_matches}")
    formatted_lines.append("")

    # Per-file statistics
    formatted_lines.append("=== Statistics by File ===")
    for file_name, stats in sorted(file_stats.items()):
        formatted_lines.append(f"File: {file_name}")
        formatted_lines.append(f"  Matches: {stats['matches']}")
        formatted_lines.append(f"  Lines with matches: {stats['lines_with_matches']}")
        formatted_lines.append("")

    formatted_output = "\n".join(formatted_lines)

    return {
        "content": [
            {
                "type": "text",
                "text": formatted_output
            }
        ]
    }


def regex_search_in_file(file_path: str, pattern: re.Pattern,
                         context_lines: int, case_sensitive: bool) -> List[Dict[str, Any]]:
    """Search a single file with a regular expression, collecting context lines."""
    results = []

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception:
        return results

    for line_number, line in enumerate(lines, 1):
        line_content = line.rstrip('\n\r')

        # Find all matches on this line
        matches = list(pattern.finditer(line_content))
        if matches:
            # Gather context
            context_before = []
            context_after = []

            if context_lines > 0:
                # Lines before the match
                start_line = max(0, line_number - 1 - context_lines)
                for i in range(start_line, line_number - 1):
                    if i < len(lines):
                        context_before.append({
                            'line_number': i + 1,
                            'content': lines[i].rstrip('\n\r')
                        })

                # Lines after the match
                end_line = min(len(lines), line_number + context_lines)
                for i in range(line_number, end_line):
                    if i < len(lines):
                        context_after.append({
                            'line_number': i + 1,
                            'content': lines[i].rstrip('\n\r')
                        })

            # Emit one result per match
            for match in matches:
                result = {
                    'file_path': file_path,
                    'match_line_number': line_number,
                    'match_text': line_content,
                    'matched_content': match.group(0),
                    'start_pos': match.start(),
                    'end_pos': match.end()
                }

                if context_before:
                    result['context_before'] = context_before

                if context_after:
                    result['context_after'] = context_after

                results.append(result)

    return results


def regex_count_in_file(file_path: str, pattern: re.Pattern,
                        case_sensitive: bool) -> tuple[int, int]:
    """Count regex matches in a single file."""
    total_matches = 0
    lines_with_matches = 0

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception:
        return total_matches, lines_with_matches

    for line_number, line in enumerate(lines, 1):
        line_content = line.rstrip('\n\r')

        # Count matches on this line
        matches = list(pattern.finditer(line_content))
        if matches:
            total_matches += len(matches)
            lines_with_matches += 1

    return total_matches, lines_with_matches
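A quick self-contained check of the two helpers above (editorial sketch; the temp file and pattern are invented):

import re
import tempfile
import os

tmp = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
tmp.write("alpha 100g\nbeta\ngamma 900g 1200g\n")
tmp.close()

pat = re.compile(r"\d+g")
hits = regex_search_in_file(tmp.name, pat, context_lines=1, case_sensitive=False)
# One result per match: line 1 yields one, line 3 yields two
assert [h['match_line_number'] for h in hits] == [1, 3, 3]
# (total matches, lines containing at least one match)
assert regex_count_in_file(tmp.name, pat, case_sensitive=False) == (3, 2)
os.unlink(tmp.name)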


async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
    """Handle MCP request"""
    try:
@@ -685,6 +1048,34 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
                "result": result
            }

        elif tool_name == "regex_grep":
            pattern = arguments.get("pattern", "")
            file_paths = arguments.get("file_paths", [])
            context_lines = arguments.get("context_lines", 0)
            case_sensitive = arguments.get("case_sensitive", False)
            limit = arguments.get("limit", 50)

            result = regex_grep(pattern, file_paths, context_lines, case_sensitive, limit)

            return {
                "jsonrpc": "2.0",
                "id": request_id,
                "result": result
            }

        elif tool_name == "regex_grep_count":
            pattern = arguments.get("pattern", "")
            file_paths = arguments.get("file_paths", [])
            case_sensitive = arguments.get("case_sensitive", False)

            result = regex_grep_count(pattern, file_paths, case_sensitive)

            return {
                "jsonrpc": "2.0",
                "id": request_id,
                "result": result
            }

        else:
            return {
                "jsonrpc": "2.0",
@@ -45,5 +45,75 @@
          "file_paths"
        ]
      }
    },
    {
      "name": "regex_grep",
      "description": "**Regex Pattern Search**: Search files using regular expressions with context lines support.\n\n**Core Features**:\n- Pure regex pattern matching without weight requirements\n- Context lines support for showing surrounding code\n- Case-sensitive/insensitive search options\n- File grouping in results for better organization\n\n**Parameters**:\n- **pattern**: Regular expression pattern to search for\n- **file_paths**: List of files to search in\n- **context_lines**: Number of lines before and after each match (default: 0)\n- **case_sensitive**: Whether to match case (default: false)\n- **limit**: Maximum number of matches to return (default: 50)\n\n**Use Cases**:\n- Pattern-based code search when you know the exact regex\n- Finding function definitions, class declarations, imports\n- Searching for specific code patterns or structures\n- Context-aware search when you need surrounding lines\n- Debugging and code navigation\n\n**Output Format**:\n- Shows total matches found\n- Groups results by file\n- Displays line numbers with matched content\n- Includes context lines when specified",
      "inputSchema": {
        "type": "object",
        "properties": {
          "pattern": {
            "type": "string",
            "description": "Regular expression pattern to search for"
          },
          "file_paths": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "List of file paths to search"
          },
          "context_lines": {
            "type": "integer",
            "description": "Number of context lines before and after each match",
            "default": 0,
            "minimum": 0
          },
          "case_sensitive": {
            "type": "boolean",
            "description": "Whether the search is case-sensitive",
            "default": false
          },
          "limit": {
            "type": "integer",
            "description": "Maximum number of matches to return",
            "default": 50,
            "minimum": 1
          }
        },
        "required": [
          "pattern",
          "file_paths"
        ]
      }
    },
    {
      "name": "regex_grep_count",
      "description": "**Regex Match Statistics**: Count regex pattern matches across files without returning actual content.\n\n**Core Features**:\n- Pure regex pattern counting without weight requirements\n- Comprehensive match statistics per file\n- Total match and line counts\n- Case-sensitive/insensitive search options\n\n**Parameters**:\n- **pattern**: Regular expression pattern to search for\n- **file_paths**: List of files to search in\n- **case_sensitive**: Whether to match case (default: false)\n\n**Use Cases**:\n- Quick assessment of pattern prevalence across codebase\n- Counting occurrences of specific functions, variables, or patterns\n- Measuring code complexity or usage statistics\n- Pre-search analysis to understand scope\n- Quality metrics and code analysis\n\n**Output Format**:\n- Summary statistics with total matches and files searched\n- Per-file breakdown with match counts and lines affected\n- Clear formatting for easy analysis and reporting",
      "inputSchema": {
        "type": "object",
        "properties": {
          "pattern": {
            "type": "string",
            "description": "Regular expression pattern to search for"
          },
          "file_paths": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "List of file paths to search"
          },
          "case_sensitive": {
            "type": "boolean",
            "description": "Whether the search is case-sensitive",
            "default": false
          }
        },
        "required": [
          "pattern",
          "file_paths"
        ]
      }
    }
  ]
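For illustration only: a JSON-RPC payload that exercises the regex_grep entry above might look like the following, assuming the standard MCP tools/call envelope (the method and params names are assumptions, as is the project path):

import json

request = {
    "jsonrpc": "2.0",
    "id": 7,
    "method": "tools/call",  # assumed MCP method name
    "params": {
        "name": "regex_grep",
        "arguments": {
            "pattern": r"class \w+",
            "file_paths": ["projects/demo/app.py"],  # hypothetical path
            "context_lines": 2,
            "limit": 20
        }
    }
}
print(json.dumps(request, indent=2))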
@@ -9,6 +9,7 @@
- Plain-text document (document.txt)
  - Raw markdown text; it gives the data its full context, but is hard to search directly.
  - A retrieved line is only meaningful together with the 10 lines before and after it; a single line is short and meaningless on its own.
  - When necessary, use the `multi_keyword-regex_grep` tool with the contextLines parameter to pull context from document.txt.
- Paginated data layer (pagination.txt):
  - Each line holds one complete page of data, so no surrounding context is needed; the neighboring lines are the previous and next pages. Suited to fetching all the material in one go.
  - The primary file for regex and keyword retrieval; find the key information here first, then consult document.txt.
@@ -33,9 +34,13 @@
### Problem analysis
1. **Problem analysis**: analyze the question and list the keywords retrieval is likely to involve, preparing for the next step.
2. **Keyword extraction**: devise and generate the core search keywords; the next step expands on them.
3. **Numeric keyword expansion**:
3. **Data preview**: for content with numbers (prices, weights, lengths, ...), call `multi_keyword-regex_grep` on `document.txt` several times to preview the data patterns; this returns little data and supports the keyword expansion that follows.

### Keyword expansion
4. **Keyword expansion**: expand and refine the search keywords based on the recalled content. Keep the keyword set as rich as possible; this matters for multi-keyword retrieval.
5. **Numeric expansion**:
   a. **Unit-normalization expansion**:
      - Weight: 1千克 → 1000g, 1kg, 1.0kg, 1000.0g, 1公斤
      - Weight: 1千克 → 1000g, 1kg, 1.0kg, 1000.0g, 1公斤, 0.99kg
      - Length: 3米 → 3m, 3.0m, 300cm, 300厘米
      - Currency: ¥9.99 → 9.99元, ¥9.99, 九点九九元
      - Time: 2小时 → 120分钟, 7200秒, 2h, 2.0小时, 两小时
@@ -51,34 +56,97 @@
      - Percentage: 25% → 0.25, 百分之二十五
      - Time: 7天 → 7日, 一周, 168小时

   d. **Range expansion** (moderate):
      - Weight: 1kg → 900g, 990g, 0.99kg, 1200g
      - Length: 3 meters → 2.8m, 3.5m, 280cm, 290 centimeters
      - Price: 100元 → 90元, 95元, 105元, 110元
      - Time: 7天 → 5天, 6天, 8天, 10天
   d. **Range expansion (moderate)**: read the quantity range that the natural-language phrasing implies, then turn that range into a regular expression that matches the corresponding text patterns.

**1. Weight**
- **Case 1: fuzzy exact value**
  - **Semantics**: `about 1kg / around 1000g`
  - **Range reading**: allow an interval around the value, e.g. ±20%, i.e. 800g to 1200g.
  - **Regex**: `/([01]\.\d+\s*[kK]?[gG]|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG])/`
  - **Explanation**:
    - `[01]\.\d+\s*[kK]?[gG]`: matches kilogram values of the form 0.x or 1.x, approximating 0.8 to 1.2 (e.g. `0.95 kg`, `1.2kg`).
    - `(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG]`: matches gram values from 800 to 1200.

   e. **Regex range expansion** (important):
      - From the numeric keywords expanded above, generate range-matching regular expressions; they retrieve better.
      - Weight: 1kg/1000g/800g-1200g → /[01].\d+\s*kg|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*g/
      - Length: 3m/3.0m → /3\s*m|3.\d+\s*m/
### Keyword expansion
4. **Data preview**:
   - **Regex retrieval for numeric content**: for content with numbers (prices, weights, lengths, ...), prefer calling `multi_keyword-search` on `document.txt` to preview the data; it returns little data and supports the keyword expansion that follows.
5. **Keyword expansion**: expand and refine the search keywords based on the recalled content. Keep the keyword set as rich as possible; this matters for multi-keyword retrieval.
- **Case 2: upper bound**
  - **Semantics**: `less than 1kg`
  - **Range reading**: from very small values (e.g. 1g) up to values near 1kg (e.g. 800g or 999g), excluding 1kg itself.
  - **Regex**: `/\b([1-9]\d{0,2}\s*[gG]|0?\.\d+\s*[kK]?[gG])\b/`
  - **Explanation**:
    - `[1-9]\d{0,2}\s*[gG]`: matches 1 to 999 grams.
    - `0?\.\d+\s*[kK]?[gG]`: matches sub-1 kilogram values such as 0.1kg and .5kg.

**2. Length**
- **Case 1: approximate value**
  - **Semantics**: `3米`
  - **Range reading**: may denote an approximate value, in the range 2.5m to 3.5m.
  - **Regex**: `/\b([2-3]\.\d+\s*[mM]|2\.5|3\.5)\b/`
  - **Explanation**: matches meter values of the form 2.x or 3.x, approximating the 2.5 to 3.5 m range.

- **Case 2: upper bound**
  - **Semantics**: `less than 3 meters`
  - **Range reading**: from very small values (e.g. 0.1m) up to values near 3 meters (e.g. 2.9m).
  - **Regex**: `/\b([0-2]\.\d+\s*[mM]|[12]?\d{1,2}\s*[cC][mM])\b/`
  - **Explanation**:
    - `[0-2]\.\d+\s*[mM]`: matches 0.0 to 2.9 meters.
    - `[12]?\d{1,2}\s*[cC][mM]`: also covers values written in centimeters, such as 50cm, 150cm, 299cm.

**3. Price**
- **Case 1: reference price**
  - **Semantics**: `100元`
  - **Range reading**: likely a reference price with a band of ±10元, i.e. 90元 to 110元.
  - **Regex**: `/\b(9[0-9]|10[0-9]|110)\s*元?\b/`
  - **Explanation**: matches integers from `90` to `110`, optionally followed by "元".

- **Case 2: price interval**
  - **Semantics**: `between 100元 and 200元`
  - **Range reading**: an explicit price interval.
  - **Regex**: `/\b(1[0-9]{2})\s*元?\b/`
  - **Explanation**: matches integers from `100` to `199`. To include 200 exactly, write `(1[0-9]{2}|200)`.

**4. Time**
- **Case 1: approximate duration**
  - **Semantics**: `7天`
  - **Range reading**: may vary by a few days, e.g. 5 to 10 days.
  - **Regex**: `/\b([5-9]|10)\s*天?\b/`
  - **Explanation**: matches the numbers `5`, `6`, `7`, `8`, `9`, `10`, optionally followed by "天".

- **Case 2: longer than a given span**
  - **Semantics**: `more than a week`
  - **Range reading**: 8 days and up, or 8 days to a month (30 days).
  - **Regex**: `/\b([8-9]|[12][0-9]|30)\s*天?\b/`
  - **Explanation**: matches `8` to `30` days.

**5. Temperature**
- **Case 1: comfortable temperature**
  - **Semantics**: `room temperature (about 25°C)`
  - **Range reading**: usually 20°C to 30°C.
  - **Regex**: `/\b(2[0-9]|30)\s*°?[Cc]\b/`
  - **Explanation**: matches integers from `20` to `30`, followed by `C` or `°C`.

- **Case 2: sub-zero temperature**
  - **Semantics**: `below zero`
  - **Range reading**: any temperature below 0°C.
  - **Regex**: `/\b-?[1-9]\d*\s*°?[Cc]\b/`
  - **Note**: this regex is simplistic; real use must handle negative values precisely.

**6. Percentage**
- **Case 1: high concentration**
  - **Semantics**: `very high concentration (above 90%)`
  - **Range reading**: 90% to 100%.
  - **Regex**: `/\b(9[0-9]|100)\s*%?\b/`
  - **Explanation**: matches integers from `90` to `100`, followed by an optional `%` sign.

- **Case 2: more than half**
  - **Semantics**: `the majority`
  - **Range reading**: can be read as 50% to 90%.
  - **Regex**: `/\b([5-8][0-9]|90)\s*%?\b/`
  - **Explanation**: matches integers from `50` to `90`.
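A quick sanity check of two of the patterns above (editorial sketch; the sample strings are invented):

import re

weight_text = "净重 0.95 kg,备货 1100g"
price_text = "售价 95 元,原价 120 元"
weight = re.compile(r"([01]\.\d+\s*[kK]?[gG]|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG])")
price = re.compile(r"\b(9[0-9]|10[0-9]|110)\s*元?\b")

print([m.group(0) for m in weight.finditer(weight_text)])  # ['0.95 kg', '1100g']
print([m.group(0) for m in price.finditer(price_text)])    # ['95 元'] (120 falls outside 90-110)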

### Strategy formulation
6. **Path selection**: choose the optimal search path for the query's complexity
   - **Strategy principle**: prefer simple field matching; avoid complex regular expressions
   - **Optimization idea**: use loose matching plus post-filtering to raise recall


### Execution and verification
7. **Search execution**: you must run a comprehensive multi-keyword + regex hybrid search with `multi_keyword-search`; do not give a final answer without this step.
8. **Cross-validation**: run context lookups for the keywords against `document.txt`, fetching the 20 lines before and after for reference.
   - Ensure completeness through multi-angle searches
   - Use different keyword combinations
   - Try multiple query patterns
   - Cross-check across the data layers
7. **Scale estimation**: call `multi_keyword-regex_grep_count` to estimate the size of the result set and avoid data overload
8. **Search execution**: before giving the final answer, you must run the multi-keyword weighted hybrid search with `multi_keyword-search`.

## Advanced search strategies

@@ -13,9 +13,9 @@ mkdir -p /app/queue_data
# Wait briefly to make sure the directories have been created
sleep 1

echo "Starting FastAPI application..."
echo "Starting FastAPI application with uvicorn..."
# Start the FastAPI application in the background
python fastapi_app.py &
uvicorn fastapi_app:app --host 0.0.0.0 --port 8000 &

echo "Starting queue consumer..."
# Start the queue consumer in the background