add regex for multi search

This commit is contained in:
朱潮 2025-10-16 21:42:18 +08:00
parent d0e3e62291
commit 9d2735a53c

View File

@ -9,7 +9,8 @@ import json
import os import os
import sys import sys
import asyncio import asyncio
from typing import Any, Dict, List, Optional import re
from typing import Any, Dict, List, Optional, Union
def validate_file_path(file_path: str, allowed_dir: str) -> str: def validate_file_path(file_path: str, allowed_dir: str) -> str:
@ -38,9 +39,48 @@ def get_allowed_directory():
return os.path.abspath(project_dir) return os.path.abspath(project_dir)
def is_regex_pattern(pattern: str) -> bool:
    """Return True when ``pattern`` should be treated as a regular expression.

    Three forms are recognised:

    * ``/body/`` -- slash-delimited, with a non-empty body;
    * ``r"body"`` or ``r'body'`` -- raw-string style, with a non-empty body
      and a closing quote identical to the opening quote;
    * any string containing at least one regex metacharacter.

    Args:
        pattern: The keyword or pattern text to classify.

    Returns:
        True if the text looks like a regular expression, False otherwise.
    """
    # /pattern/ form: needs at least one character between the slashes.
    if len(pattern) > 2 and pattern.startswith('/') and pattern.endswith('/'):
        return True

    # r"pattern" / r'pattern' form. The closing quote must be the same
    # character as the opening one, so a mismatched pair such as r"abc'
    # is not accepted as a delimited regex.
    if (
        len(pattern) > 3
        and pattern[0] == 'r'
        and pattern[1] in ('"', "'")
        and pattern[-1] == pattern[1]
    ):
        return True

    # Otherwise fall back to a heuristic: any regex metacharacter present.
    regex_chars = {'*', '+', '?', '|', '(', ')', '[', ']', '{', '}', '^', '$', '\\', '.'}
    return any(char in pattern for char in regex_chars)
def compile_pattern(pattern: str) -> Union[re.Pattern, str, None]:
    """Compile ``pattern`` when it is a regex, otherwise return it unchanged.

    Delimiters are stripped before compiling: ``/body/`` and ``r"body"`` /
    ``r'body'`` compile ``body``. A raw-string form is only unwrapped when
    its closing quote matches its opening quote. Any other text that
    ``is_regex_pattern`` classifies as a regex is compiled as written.

    Args:
        pattern: The keyword or pattern text supplied by the caller.

    Returns:
        A compiled ``re.Pattern`` for regex input, the original string for
        plain keywords, or ``None`` when the regex fails to compile.
    """
    if not is_regex_pattern(pattern):
        return pattern

    # Work out which part of the text is the actual regex body.
    if len(pattern) > 2 and pattern.startswith('/') and pattern.endswith('/'):
        regex_body = pattern[1:-1]
    elif (
        len(pattern) > 3
        and pattern[0] == 'r'
        and pattern[1] in ('"', "'")
        and pattern[-1] == pattern[1]
    ):
        regex_body = pattern[2:-1]
    else:
        # No recognised delimiters: the whole string is the regex.
        regex_body = pattern

    try:
        return re.compile(regex_body)
    except re.error as e:
        # None tells the caller this regex is invalid. The warning goes to
        # stderr so diagnostics never mix with data written to stdout.
        print(f"警告: 正则表达式 '{pattern}' 编译失败: {e}", file=sys.stderr)
        return None
def multi_keyword_search(keywords: List[str], file_paths: List[str], def multi_keyword_search(keywords: List[str], file_paths: List[str],
limit: int = 10, case_sensitive: bool = False) -> Dict[str, Any]: limit: int = 10, case_sensitive: bool = False) -> Dict[str, Any]:
"""执行多关键词搜索""" """执行多关键词和正则表达式搜索"""
if not keywords: if not keywords:
return { return {
"content": [ "content": [
@ -61,6 +101,21 @@ def multi_keyword_search(keywords: List[str], file_paths: List[str],
] ]
} }
# 预处理和验证关键词中的正则表达式
valid_keywords = []
regex_errors = []
for keyword in keywords:
compiled = compile_pattern(keyword)
if compiled is None:
regex_errors.append(keyword)
else:
valid_keywords.append(keyword)
if regex_errors:
error_msg = f"警告: 以下正则表达式编译失败,将被忽略: {', '.join(regex_errors)}"
print(error_msg)
# 处理项目目录限制 # 处理项目目录限制
project_data_dir = get_allowed_directory() project_data_dir = get_allowed_directory()
@ -100,7 +155,7 @@ def multi_keyword_search(keywords: List[str], file_paths: List[str],
for file_path in valid_paths: for file_path in valid_paths:
try: try:
results = search_keywords_in_file(file_path, keywords, case_sensitive) results = search_keywords_in_file(file_path, valid_keywords, case_sensitive)
all_results.extend(results) all_results.extend(results)
except Exception as e: except Exception as e:
continue continue
@ -122,10 +177,24 @@ def multi_keyword_search(keywords: List[str], file_paths: List[str],
] ]
} }
formatted_output = "\n".join([ # 增强格式化输出,显示匹配类型和详细信息
f"{result['line_number']}:match_count({result['match_count']}):{result['content']}" formatted_lines = []
for result in limited_results for result in limited_results:
]) line_prefix = f"{result['line_number']}:match_count({result['match_count']}):"
# 构建匹配详情
match_details = []
for pattern in result['matched_patterns']:
if pattern['type'] == 'regex':
match_details.append(f"[regex:{pattern['original']}={pattern['match']}]")
else:
match_details.append(f"[keyword:{pattern['match']}]")
match_info = " ".join(match_details) if match_details else ""
formatted_line = f"{line_prefix}{match_info}:{result['content']}" if match_info else f"{line_prefix}{result['content']}"
formatted_lines.append(formatted_line)
formatted_output = "\n".join(formatted_lines)
return { return {
"content": [ "content": [
@ -139,7 +208,7 @@ def multi_keyword_search(keywords: List[str], file_paths: List[str],
def search_keywords_in_file(file_path: str, keywords: List[str], def search_keywords_in_file(file_path: str, keywords: List[str],
case_sensitive: bool) -> List[Dict[str, Any]]: case_sensitive: bool) -> List[Dict[str, Any]]:
"""搜索单个文件中的关键词""" """搜索单个文件中的关键词和正则表达式"""
results = [] results = []
try: try:
@ -148,27 +217,69 @@ def search_keywords_in_file(file_path: str, keywords: List[str],
except Exception as e: except Exception as e:
return results return results
# 准备关键词(如果不区分大小写) # 预处理所有模式
search_keywords = keywords if case_sensitive else [kw.lower() for kw in keywords] processed_patterns = []
for keyword in keywords:
compiled = compile_pattern(keyword)
if compiled is not None: # 跳过无效的正则表达式
processed_patterns.append({
'original': keyword,
'pattern': compiled,
'is_regex': isinstance(compiled, re.Pattern)
})
for line_number, line in enumerate(lines, 1): for line_number, line in enumerate(lines, 1):
line_content = line.rstrip('\n\r') line_content = line.rstrip('\n\r')
search_line = line_content if case_sensitive else line_content.lower() search_line = line_content if case_sensitive else line_content.lower()
# 统计匹配的关键词数量 # 统计匹配的模式数量
matched_keywords = [] matched_patterns = []
for i, keyword in enumerate(search_keywords): for pattern_info in processed_patterns:
if keyword in search_line: pattern = pattern_info['pattern']
matched_keywords.append(keywords[i]) # 使用原始关键词 is_regex = pattern_info['is_regex']
match_found = False
match_details = None
if is_regex:
# 正则表达式匹配
if case_sensitive:
match = pattern.search(line_content)
else:
# 对于不区分大小写的正则,需要重新编译
if isinstance(pattern, re.Pattern):
# 创建不区分大小写的版本
flags = pattern.flags | re.IGNORECASE
case_insensitive_pattern = re.compile(pattern.pattern, flags)
match = case_insensitive_pattern.search(line_content)
else:
match = pattern.search(search_line)
if match:
match_found = True
match_details = match.group(0)
else:
# 普通字符串匹配
search_keyword = pattern if case_sensitive else pattern.lower()
if search_keyword in search_line:
match_found = True
match_details = pattern
if match_found:
matched_patterns.append({
'original': pattern_info['original'],
'type': 'regex' if is_regex else 'keyword',
'match': match_details
})
match_count = len(matched_keywords) match_count = len(matched_patterns)
if match_count > 0: if match_count > 0:
results.append({ results.append({
'line_number': line_number, 'line_number': line_number,
'content': line_content, 'content': line_content,
'match_count': match_count, 'match_count': match_count,
'matched_keywords': matched_keywords, 'matched_patterns': matched_patterns,
'file_path': file_path 'file_path': file_path
}) })
@ -223,14 +334,14 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
"tools": [ "tools": [
{ {
"name": "multi_keyword_search", "name": "multi_keyword_search",
"description": "多关键词搜索工具,返回按匹配数量排序的结果。格式:[行号]:[匹配数量]:[行的原始内容]", "description": "智能关键词和正则表达式混合搜索工具,返回按匹配数量排序的结果。支持普通关键词和正则表达式混合使用。正则表达式支持格式:/pattern/、r\"pattern\"或包含正则特殊字符的字符串。结果格式:[行号]:[匹配数量]:[匹配信息]:[行的原始内容]",
"inputSchema": { "inputSchema": {
"type": "object", "type": "object",
"properties": { "properties": {
"keywords": { "keywords": {
"type": "array", "type": "array",
"items": {"type": "string"}, "items": {"type": "string"},
"description": "要搜索的关键词数组" "description": "要搜索的关键词和正则表达式数组。支持1)普通关键词 2)/pattern/格式正则 3)r\"pattern\"格式正则 4)包含正则特殊字符的字符串"
}, },
"file_paths": { "file_paths": {
"type": "array", "type": "array",