add regex grep

朱潮 2025-10-22 19:02:42 +08:00
parent 8cf7f956d6
commit 3591d8228e
6 changed files with 564 additions and 33 deletions

View File

@@ -19,21 +19,21 @@ RUN sed -i 's|http://deb.debian.org|http://mirrors.aliyun.com|g' /etc/apt/source
COPY requirements.txt .
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
# Copy the application code
COPY . .
# Install modelscope
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ modelscope
# Create the required directories
RUN mkdir -p /app/projects
RUN mkdir -p /app/projects
RUN mkdir -p /app/public
RUN mkdir -p /app/models
RUN mkdir -p /app/queue_data
# Install modelscope
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ modelscope
# Download the sentence-transformers model from modelscope into the models directory
RUN python -c "from modelscope import snapshot_download; model_dir = snapshot_download('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); import shutil; shutil.move(model_dir, '/app/models/paraphrase-multilingual-MiniLM-L12-v2')"
# Copy the application code
COPY . .
# Expose the port
EXPOSE 8001

View File

@@ -2,7 +2,9 @@ version: "3.8"
services:
  qwen-agent:
    build: .
    build:
      context: .
      dockerfile: Dockerfile.modelscope
    container_name: qwen-agent-api
    ports:
      - "8001:8001"

View File

@@ -611,6 +611,369 @@ def find_file_in_project(filename: str, project_dir: str) -> Optional[str]:
    return None

def regex_grep(pattern: str, file_paths: List[str], context_lines: int = 0,
               case_sensitive: bool = False, limit: int = 50) -> Dict[str, Any]:
    """Search file contents with a regular expression, optionally including context lines."""
    if not pattern:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: Pattern cannot be empty"
                }
            ]
        }
    if not file_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: File path list cannot be empty"
                }
            ]
        }
    # Compile the regular expression
    try:
        flags = 0 if case_sensitive else re.IGNORECASE
        compiled_pattern = re.compile(pattern, flags)
    except re.error as e:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Invalid regular expression '{pattern}': {str(e)}"
                }
            ]
        }
    # Restrict the search to the project directory
    project_data_dir = get_allowed_directory()
    # Validate the file paths
    valid_paths = []
    for file_path in file_paths:
        try:
            # Resolve relative paths
            if not os.path.isabs(file_path):
                # Strip the projects/ prefix if present
                clean_path = file_path
                if clean_path.startswith('projects/'):
                    clean_path = clean_path[9:]  # drop the 'projects/' prefix
                elif clean_path.startswith('./projects/'):
                    clean_path = clean_path[11:]  # drop the './projects/' prefix
                # Look for the file in the project directory
                full_path = os.path.join(project_data_dir, clean_path.lstrip('./'))
                if os.path.exists(full_path):
                    valid_paths.append(full_path)
                else:
                    # If the direct path does not exist, fall back to a recursive search
                    found = find_file_in_project(clean_path, project_data_dir)
                    if found:
                        valid_paths.append(found)
            else:
                if file_path.startswith(project_data_dir) and os.path.exists(file_path):
                    valid_paths.append(file_path)
        except Exception:
            continue
    if not valid_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
                }
            ]
        }
    # Collect all matches
    all_results = []
    for file_path in valid_paths:
        try:
            results = regex_search_in_file(file_path, compiled_pattern, context_lines, case_sensitive)
            all_results.extend(results)
        except Exception:
            continue
    # Sort by file path and line number
    all_results.sort(key=lambda x: (x['file_path'], x['match_line_number']))
    # Cap the number of results
    limited_results = all_results[:limit]
    if not limited_results:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "No matches found"
                }
            ]
        }
    # Format the output, starting with the total match count
    formatted_lines = []
    total_matches = len(all_results)
    showing_count = len(limited_results)
    summary_line = f"Found {total_matches} matches, showing first {showing_count} results:"
    formatted_lines.append(summary_line)
    # Group the results by file
    current_file = None
    for result in limited_results:
        file_path = result['file_path']
        if file_path != current_file:
            current_file = file_path
            file_name = os.path.basename(file_path)
            formatted_lines.append(f"\n--- File: {file_name} ---")
        match_line = result['match_line_number']
        matched_content = result['matched_content']
        # Show the leading context lines
        if 'context_before' in result:
            for context_line in result['context_before']:
                formatted_lines.append(f"{context_line['line_number']}:{context_line['content']}")
        # Show the matching line
        formatted_lines.append(f"{match_line}:{matched_content}")
        # Show the trailing context lines
        if 'context_after' in result:
            for context_line in result['context_after']:
                formatted_lines.append(f"{context_line['line_number']}:{context_line['content']}")
    formatted_output = "\n".join(formatted_lines)
    return {
        "content": [
            {
                "type": "text",
                "text": formatted_output
            }
        ]
    }
def regex_grep_count(pattern: str, file_paths: List[str],
                     case_sensitive: bool = False) -> Dict[str, Any]:
    """Count regular-expression matches across files."""
    if not pattern:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: Pattern cannot be empty"
                }
            ]
        }
    if not file_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: File path list cannot be empty"
                }
            ]
        }
    # Compile the regular expression
    try:
        flags = 0 if case_sensitive else re.IGNORECASE
        compiled_pattern = re.compile(pattern, flags)
    except re.error as e:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Invalid regular expression '{pattern}': {str(e)}"
                }
            ]
        }
    # Restrict the search to the project directory
    project_data_dir = get_allowed_directory()
    # Validate the file paths
    valid_paths = []
    for file_path in file_paths:
        try:
            # Resolve relative paths
            if not os.path.isabs(file_path):
                # Strip the projects/ prefix if present
                clean_path = file_path
                if clean_path.startswith('projects/'):
                    clean_path = clean_path[9:]  # drop the 'projects/' prefix
                elif clean_path.startswith('./projects/'):
                    clean_path = clean_path[11:]  # drop the './projects/' prefix
                # Look for the file in the project directory
                full_path = os.path.join(project_data_dir, clean_path.lstrip('./'))
                if os.path.exists(full_path):
                    valid_paths.append(full_path)
                else:
                    # If the direct path does not exist, fall back to a recursive search
                    found = find_file_in_project(clean_path, project_data_dir)
                    if found:
                        valid_paths.append(found)
            else:
                if file_path.startswith(project_data_dir) and os.path.exists(file_path):
                    valid_paths.append(file_path)
        except Exception:
            continue
    if not valid_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
                }
            ]
        }
    # Tally the matches
    total_matches = 0
    total_lines_with_matches = 0
    file_stats = {}
    for file_path in valid_paths:
        try:
            matches, lines_with_matches = regex_count_in_file(file_path, compiled_pattern, case_sensitive)
            total_matches += matches
            total_lines_with_matches += lines_with_matches
            file_name = os.path.basename(file_path)
            file_stats[file_name] = {
                'matches': matches,
                'lines_with_matches': lines_with_matches
            }
        except Exception:
            continue
    # Format the output
    formatted_lines = []
    formatted_lines.append("=== Regex Match Statistics ===")
    formatted_lines.append(f"Pattern: {pattern}")
    formatted_lines.append(f"Files searched: {len(valid_paths)}")
    formatted_lines.append(f"Total matches: {total_matches}")
    formatted_lines.append(f"Total lines with matches: {total_lines_with_matches}")
    formatted_lines.append("")
    # Per-file statistics
    formatted_lines.append("=== Statistics by File ===")
    for file_name, stats in sorted(file_stats.items()):
        formatted_lines.append(f"File: {file_name}")
        formatted_lines.append(f"  Matches: {stats['matches']}")
        formatted_lines.append(f"  Lines with matches: {stats['lines_with_matches']}")
        formatted_lines.append("")
    formatted_output = "\n".join(formatted_lines)
    return {
        "content": [
            {
                "type": "text",
                "text": formatted_output
            }
        ]
    }
def regex_search_in_file(file_path: str, pattern: re.Pattern,
                         context_lines: int, case_sensitive: bool) -> List[Dict[str, Any]]:
    """Search a single file with a compiled regex, collecting optional context lines.

    case_sensitive is already encoded in the compiled pattern's flags.
    """
    results = []
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception:
        return results
    for line_number, line in enumerate(lines, 1):
        line_content = line.rstrip('\n\r')
        # Find all matches on this line
        matches = list(pattern.finditer(line_content))
        if matches:
            # Prepare the context
            context_before = []
            context_after = []
            if context_lines > 0:
                # Collect the preceding context
                start_line = max(0, line_number - 1 - context_lines)
                for i in range(start_line, line_number - 1):
                    if i < len(lines):
                        context_before.append({
                            'line_number': i + 1,
                            'content': lines[i].rstrip('\n\r')
                        })
                # Collect the following context
                end_line = min(len(lines), line_number + context_lines)
                for i in range(line_number, end_line):
                    if i < len(lines):
                        context_after.append({
                            'line_number': i + 1,
                            'content': lines[i].rstrip('\n\r')
                        })
            # Create one result per match
            for match in matches:
                result = {
                    'file_path': file_path,
                    'match_line_number': line_number,
                    'match_text': line_content,
                    'matched_content': match.group(0),
                    'start_pos': match.start(),
                    'end_pos': match.end()
                }
                if context_before:
                    result['context_before'] = context_before
                if context_after:
                    result['context_after'] = context_after
                results.append(result)
    return results
def regex_count_in_file(file_path: str, pattern: re.Pattern,
                        case_sensitive: bool) -> tuple[int, int]:
    """Count matches in a single file; case_sensitive is already encoded in the pattern's flags."""
    total_matches = 0
    lines_with_matches = 0
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception:
        return total_matches, lines_with_matches
    for line in lines:
        line_content = line.rstrip('\n\r')
        # Count the matches on this line
        matches = list(pattern.finditer(line_content))
        if matches:
            total_matches += len(matches)
            lines_with_matches += 1
    return total_matches, lines_with_matches
async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
    """Handle MCP request"""
    try:

@@ -685,6 +1048,34 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
                "result": result
            }
        elif tool_name == "regex_grep":
            pattern = arguments.get("pattern", "")
            file_paths = arguments.get("file_paths", [])
            context_lines = arguments.get("context_lines", 0)
            case_sensitive = arguments.get("case_sensitive", False)
            limit = arguments.get("limit", 50)
            result = regex_grep(pattern, file_paths, context_lines, case_sensitive, limit)
            return {
                "jsonrpc": "2.0",
                "id": request_id,
                "result": result
            }
        elif tool_name == "regex_grep_count":
            pattern = arguments.get("pattern", "")
            file_paths = arguments.get("file_paths", [])
            case_sensitive = arguments.get("case_sensitive", False)
            result = regex_grep_count(pattern, file_paths, case_sensitive)
            return {
                "jsonrpc": "2.0",
                "id": request_id,
                "result": result
            }
        else:
            return {
                "jsonrpc": "2.0",
View File

@@ -45,5 +45,75 @@
        "file_paths"
      ]
    }
  },
  {
    "name": "regex_grep",
    "description": "**Regex Pattern Search**: Search files using regular expressions with context lines support.\n\n**Core Features**:\n- Pure regex pattern matching without weight requirements\n- Context lines support for showing surrounding code\n- Case-sensitive/insensitive search options\n- File grouping in results for better organization\n\n**Parameters**:\n- **pattern**: Regular expression pattern to search for\n- **file_paths**: List of files to search in\n- **context_lines**: Number of lines before and after each match (default: 0)\n- **case_sensitive**: Whether to match case (default: false)\n- **limit**: Maximum number of matches to return (default: 50)\n\n**Use Cases**:\n- Pattern-based code search when you know the exact regex\n- Finding function definitions, class declarations, imports\n- Searching for specific code patterns or structures\n- Context-aware search when you need surrounding lines\n- Debugging and code navigation\n\n**Output Format**:\n- Shows total matches found\n- Groups results by file\n- Displays line numbers with matched content\n- Includes context lines when specified",
    "inputSchema": {
      "type": "object",
      "properties": {
        "pattern": {
          "type": "string",
          "description": "Regular expression pattern to search for"
        },
        "file_paths": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "List of file paths to search"
        },
        "context_lines": {
          "type": "integer",
          "description": "Number of context lines before and after each match",
          "default": 0,
          "minimum": 0
        },
        "case_sensitive": {
          "type": "boolean",
          "description": "Whether the search is case-sensitive",
          "default": false
        },
        "limit": {
          "type": "integer",
          "description": "Maximum number of matches to return",
          "default": 50,
          "minimum": 1
        }
      },
      "required": [
        "pattern",
        "file_paths"
      ]
    }
  },
  {
    "name": "regex_grep_count",
    "description": "**Regex Match Statistics**: Count regex pattern matches across files without returning actual content.\n\n**Core Features**:\n- Pure regex pattern counting without weight requirements\n- Comprehensive match statistics per file\n- Total match and line counts\n- Case-sensitive/insensitive search options\n\n**Parameters**:\n- **pattern**: Regular expression pattern to search for\n- **file_paths**: List of files to search in\n- **case_sensitive**: Whether to match case (default: false)\n\n**Use Cases**:\n- Quick assessment of pattern prevalence across codebase\n- Counting occurrences of specific functions, variables, or patterns\n- Measuring code complexity or usage statistics\n- Pre-search analysis to understand scope\n- Quality metrics and code analysis\n\n**Output Format**:\n- Summary statistics with total matches and files searched\n- Per-file breakdown with match counts and lines affected\n- Clear formatting for easy analysis and reporting",
    "inputSchema": {
      "type": "object",
      "properties": {
        "pattern": {
          "type": "string",
          "description": "Regular expression pattern to search for"
        },
        "file_paths": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "List of file paths to search"
        },
        "case_sensitive": {
          "type": "boolean",
          "description": "Whether the search is case-sensitive",
          "default": false
        }
      },
      "required": [
        "pattern",
        "file_paths"
      ]
    }
  }
]
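
As a reference for callers, `tools/call` payloads satisfying these two schemas could look like the following sketch. The JSON-RPC envelope follows the standard MCP convention dispatched by `handle_request`; the ids, patterns, and file paths are illustrative, not part of this commit.

```python
# Hypothetical MCP requests; only "name" and "arguments" are defined by the
# schemas above, the rest is the standard JSON-RPC envelope.
grep_request = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/call",
    "params": {
        "name": "regex_grep",
        "arguments": {
            "pattern": r"def \w+\(",
            "file_paths": ["projects/demo/app.py"],
            "context_lines": 2,
            "limit": 20,
        },
    },
}

count_request = {
    "jsonrpc": "2.0",
    "id": 2,
    "method": "tools/call",
    "params": {
        "name": "regex_grep_count",
        "arguments": {
            "pattern": r"import \w+",
            "file_paths": ["projects/demo/app.py"],
            "case_sensitive": True,
        },
    },
}
```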

View File

@@ -9,6 +9,7 @@
- Plain-text document layer (document.txt)
  - Raw markdown text; provides the full context for the data, but is hard to search directly.
  - When retrieving a single line, include the 10 lines of context before and after it; a single line on its own is short and meaningless.
  - When necessary, use the `multi_keyword-regex_grep` tool with the `context_lines` parameter to pull context from document.txt.
- Paginated data layer (pagination.txt)
  - Each line holds one complete page of data, so no surrounding context is needed; neighboring lines correspond to the previous and next pages. Suited to fetching all the material in one pass.
  - The primary file for regex and keyword searches; find the key information here first, then consult document.txt.
@@ -33,9 +34,13 @@
### Problem Analysis
1. **Problem analysis**: Analyze the question and collect the keywords the search may involve, preparing for the next step.
2. **Keyword extraction**: Draft the core keywords to search for; the next step expands on these keywords.
3. **Numeric keyword expansion**
3. **Data preview**: For content with numbers (prices, weights, lengths, etc.), you can call `multi_keyword-regex_grep` on `document.txt` several times to preview the data patterns; the small result set it returns supports the keyword expansion in the next step.
### Keyword Expansion
4. **Keyword expansion**: Expand and refine the search keywords based on the recalled content. Keep the keyword set as rich as possible; this matters greatly for multi-keyword search.
5. **Numeric expansion**
   a. **Unit-normalization expansion**
      - Weight: 1千克 → 1000g, 1kg, 1.0kg, 1000.0g, 1公斤
      - Weight: 1千克 → 1000g, 1kg, 1.0kg, 1000.0g, 1公斤, 0.99kg
      - Length: 3米 → 3m, 3.0m, 300cm, 300厘米
      - Currency: ¥9.99 → 9.99元, ¥9.99, 九点九九元
      - Time: 2小时 → 120分钟, 7200秒, 2h, 2.0小时, 两小时
@@ -51,34 +56,97 @@
      - Percentage: 25% → 0.25, 百分之二十五
      - Time: 7天 → 7日, 一周, 168小时
   d. **Range expansion** (in moderation):
      - Weight: 1kg → 900g, 990g, 0.99kg, 1200g
      - Length: 3 meters → 2.8m, 3.5m, 280cm, 290 centimeters
      - Price: 100元 → 90元, 95元, 105元, 110元
      - Time: 7天 → 5天, 6天, 8天, 10天
   d. **Range expansion (in moderation)**: Infer the quantity range that the natural language expresses, then turn that range into a regular expression matching the corresponding text patterns (a verification sketch follows the cases below).
**1. Weight**
- **Case 1: Fuzzy exact value**
  - **Semantics**: `大约1kg/1000g左右` (roughly 1kg/1000g)
  - **Range interpretation**: Allow a band around the value, e.g. ±20%, i.e. 800g to 1200g.
  - **Regex**: `/([01]\.\d+\s*[kK]?[gG]|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG])/`
  - **Explanation**:
    - `[01]\.\d+\s*[kK]?[gG]`: matches kilogram values between 0.8 and 1.2 (e.g. `0.95 kg`, `1.2kg`).
    - `(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG]`: matches gram values between 800 and 1200.
   e. **Regex range expansion** (important):
      - Generate range-matching regular expressions from the numeric keywords expanded above; they retrieve results more effectively.
      - Weight: 1kg/1000g/800g-1200g → /[01].\d+\s*kg|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*g/
      - Length: 3m/3.0m → /3\s*m|3.\d+\s*m/
### Keyword Expansion
4. **Data preview**
   **Regex search over numeric content**: For content with numbers (prices, weights, lengths, etc.), prefer calling `multi_keyword-search` on `document.txt` first to preview the data; the small result set it returns informs the keyword expansion in the next step.
5. **Keyword expansion**: Expand and refine the search keywords based on the recalled content. Keep the keyword set as rich as possible; this matters greatly for multi-keyword search.
- **Case 2: Upper bound**
  - **Semantics**: `小于1kg` (less than 1kg)
  - **Range interpretation**: From very small values (e.g. 1g) up to values near 1kg (e.g. 800g or 999g), excluding 1kg itself.
  - **Regex**: `/\b([1-9]\d{0,2}\s*[gG]|0?\.\d+\s*[kK]?[gG])\b/`
  - **Explanation**:
    - `[1-9]\d{0,2}\s*[gG]`: matches 1-999 grams.
    - `0?\.\d+\s*[kK]?[gG]`: matches sub-1 kilogram values such as 0.1kg or .5kg.
**2. Length**
- **Case 1: Approximate value**
  - **Semantics**: `约3米` (about 3 meters)
  - **Range interpretation**: Probably an approximate value, ranging from 2.5 to 3.5 meters.
  - **Regex**: `/\b([2-3]\.\d+\s*[mM]|2\.5|3\.5)\b/`
  - **Explanation**: matches meter values between 2.5 and 3.5.
- **Case 2: Upper bound**
  - **Semantics**: `小于3米` (less than 3 meters)
  - **Range interpretation**: From very small values (e.g. 0.1m) up to values near 3 meters (e.g. 2.9m).
  - **Regex**: `/\b([0-2]\.\d+\s*[mM]|[12]?\d{1,2}\s*[cC][mM])\b/`
  - **Explanation**:
    - `[0-2]\.\d+\s*[mM]`: matches 0.0 to 2.9 meters.
    - `[12]?\d{1,2}\s*[cC][mM]`: also covers lengths written in centimeters, e.g. 50cm, 150cm, 299cm.
**3. Price**
- **Case 1: Reference price**
  - **Semantics**: `100元` (100 yuan)
  - **Range interpretation**: Probably a reference price with roughly ±10元 of play, i.e. 90元 to 110元.
  - **Regex**: `/\b(9[0-9]|10[0-9]|110)\s*元?\b/`
  - **Explanation**: matches integers between 90 and 110, optionally followed by "元".
- **Case 2: Price interval**
  - **Semantics**: `100到200元之间` (between 100 and 200 yuan)
  - **Range interpretation**: An explicit price interval.
  - **Regex**: `/\b(1[0-9]{2})\s*元?\b/`
  - **Explanation**: matches integers between 100 and 199; to include 200 exactly, write `(1[0-9]{2}|200)`.
**4. Time**
- **Case 1: Approximate duration**
  - **Semantics**: `7天` (7 days)
  - **Range interpretation**: May drift a few days either way, e.g. 5 to 10 days.
  - **Regex**: `/\b([5-9]|10)\s*天?\b/`
  - **Explanation**: matches the numbers 5, 6, 7, 8, 9, or 10, optionally followed by "天".
- **Case 2: Beyond a given time**
  - **Semantics**: `大于一周` (more than a week)
  - **Range interpretation**: 8 days and up, or 8 days to one month (30 days).
  - **Regex**: `/\b([8-9]|[12][0-9]|30)\s*天?\b/`
  - **Explanation**: matches 8 to 30 days.
**5. Temperature**
- **Case 1: Comfortable temperature**
  - **Semantics**: `室温约25摄氏度` (room temperature, about 25°C)
  - **Range interpretation**: Usually taken to mean 20°C to 30°C.
  - **Regex**: `/\b(2[0-9]|30)\s*°?[Cc]\b/`
  - **Explanation**: matches integers between 20 and 30, followed by `C` or `°C`.
- **Case 2: Below freezing**
  - **Semantics**: `零度以下` (below zero)
  - **Range interpretation**: Any temperature below 0°C.
  - **Regex**: `/\b-?[1-9]\d*\s*°?[Cc]\b/`
  - **Note**: this regex is simplistic; real use needs more precise handling of negative values.
**6. Percentage**
- **Case 1: High concentration**
  - **Semantics**: `浓度很高,超过90%` (very high concentration, above 90%)
  - **Range interpretation**: 90% to 100%.
  - **Regex**: `/\b(9[0-9]|100)\s*%?\b/`
  - **Explanation**: matches integers between 90 and 100, followed by an optional `%`.
- **Case 2: More than half**
  - **Semantics**: `大部分` (the majority)
  - **Range interpretation**: Can be read as 50% to 90%.
  - **Regex**: `/\b([5-8][0-9]|90)\s*%?\b/`
  - **Explanation**: matches integers between 50 and 90.
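
Before handing a range regex to `multi_keyword-regex_grep`, it can be worth sanity-checking it locally against a few sample lines. A minimal sketch using two of the patterns above (the sample strings are invented):

```python
import re

# Patterns copied from the weight and price cases above.
weight_about_1kg = re.compile(r"([01]\.\d+\s*[kK]?[gG]|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG])")
price_around_100 = re.compile(r"\b(9[0-9]|10[0-9]|110)\s*元?\b")

samples = [
    "净重 0.95kg,产地山东",   # ~1kg, kilogram form -> weight match
    "规格: 1150 g / 袋",       # ~1kg, gram form -> weight match
    "促销价: 95元 起",         # inside the 90-110 band -> price match
    "总价 150元",              # outside the band -> no match
]
for line in samples:
    w = weight_about_1kg.search(line)
    p = price_around_100.search(line)
    print(f"{line!r}: weight={w.group(0) if w else None}, price={p.group(0) if p else None}")
```

One caveat worth remembering: in Python, `\b` does not fire between CJK characters and digits (both count as word characters), so numbers glued directly to preceding Chinese text can be missed by the `\b`-anchored patterns.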
### Strategy Formulation
6. **Path selection**: Choose the optimal search path based on query complexity.
   - **Strategy principle**: Prefer simple field matching; avoid complex regular expressions.
   - **Optimization approach**: Use loose matching plus post-filtering to improve recall.
### Execution and Verification
7. **Search execution**: You must run a comprehensive multi-keyword + regex hybrid search with `multi_keyword-search`; do not give a final answer without completing this step.
8. **Cross-validation**: Run context lookups with the keywords against `document.txt`, retrieving 20 lines before and after each hit for reference.
   - Search from multiple angles to ensure complete results.
   - Use different keyword combinations.
   - Try multiple query patterns.
   - Verify across the different data layers.
7. **Scale estimation**: Call `multi_keyword-regex_grep_count` to gauge the size of the result set and avoid data overload.
8. **Search execution**: Before giving the final answer, you must run a weighted multi-keyword hybrid search with `multi_keyword-search`.
## Advanced Search Strategies

View File

@@ -13,9 +13,9 @@ mkdir -p /app/queue_data
# Wait briefly to make sure the directories have been created
sleep 1
echo "Starting FastAPI application..."
echo "Starting FastAPI application with uvicorn..."
# Start the FastAPI application in the background
python fastapi_app.py &
uvicorn fastapi_app:app --host 0.0.0.0 --port 8000 &
echo "Starting queue consumer..."
# Start the queue consumer in the background