add regex grep
This commit is contained in:
parent 8cf7f956d6
commit 3591d8228e
@@ -19,8 +19,8 @@ RUN sed -i 's|http://deb.debian.org|http://mirrors.aliyun.com|g' /etc/apt/source
COPY requirements.txt .
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt

# Copy the application code
COPY . .
# Install modelscope
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ modelscope

# Create the required directories
RUN mkdir -p /app/projects
@@ -28,12 +28,12 @@ RUN mkdir -p /app/public
RUN mkdir -p /app/models
RUN mkdir -p /app/queue_data

# Install modelscope
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ modelscope

# Download the sentence-transformers model from ModelScope into the models directory
RUN python -c "from modelscope import snapshot_download; model_dir = snapshot_download('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'); import shutil; shutil.move(model_dir, '/app/models/paraphrase-multilingual-MiniLM-L12-v2')"

# Copy the application code
COPY . .

# Expose the port
EXPOSE 8001

@@ -2,7 +2,9 @@ version: "3.8"

services:
  qwen-agent:
    build: .
    build:
      context: .
      dockerfile: Dockerfile.modelscope
    container_name: qwen-agent-api
    ports:
      - "8001:8001"

@@ -611,6 +611,369 @@ def find_file_in_project(filename: str, project_dir: str) -> Optional[str]:
    return None


def regex_grep(pattern: str, file_paths: List[str], context_lines: int = 0,
               case_sensitive: bool = False, limit: int = 50) -> Dict[str, Any]:
    """Search file contents with a regular expression, with optional context lines."""
    if not pattern:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: Pattern cannot be empty"
                }
            ]
        }

    if not file_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: File path list cannot be empty"
                }
            ]
        }

    # Compile the regular expression
    try:
        flags = 0 if case_sensitive else re.IGNORECASE
        compiled_pattern = re.compile(pattern, flags)
    except re.error as e:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Invalid regular expression '{pattern}': {str(e)}"
                }
            ]
        }

    # Restrict searches to the project directory
    project_data_dir = get_allowed_directory()

    # Validate the file paths
    valid_paths = []
    for file_path in file_paths:
        try:
            # Resolve relative paths
            if not os.path.isabs(file_path):
                # Strip a leading 'projects/' prefix if present
                clean_path = file_path
                if clean_path.startswith('projects/'):
                    clean_path = clean_path[9:]  # drop the 'projects/' prefix
                elif clean_path.startswith('./projects/'):
                    clean_path = clean_path[11:]  # drop the './projects/' prefix

                # Look for the file inside the project directory
                full_path = os.path.join(project_data_dir, clean_path.lstrip('./'))
                if os.path.exists(full_path):
                    valid_paths.append(full_path)
                else:
                    # Fall back to a recursive lookup if the direct path is missing
                    found = find_file_in_project(clean_path, project_data_dir)
                    if found:
                        valid_paths.append(found)
            else:
                if file_path.startswith(project_data_dir) and os.path.exists(file_path):
                    valid_paths.append(file_path)
        except Exception:
            continue

    if not valid_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
                }
            ]
        }

    # Collect all matches
    all_results = []

    for file_path in valid_paths:
        try:
            results = regex_search_in_file(file_path, compiled_pattern, context_lines, case_sensitive)
            all_results.extend(results)
        except Exception:
            continue

    # Sort by file path and line number
    all_results.sort(key=lambda x: (x['file_path'], x['match_line_number']))

    # Cap the number of results
    limited_results = all_results[:limit]

    if not limited_results:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "No matches found"
                }
            ]
        }

    # Format the output
    formatted_lines = []

    # Report the total match count
    total_matches = len(all_results)
    showing_count = len(limited_results)
    summary_line = f"Found {total_matches} matches, showing top {showing_count} results:"
    formatted_lines.append(summary_line)

    # Group results by file
    current_file = None
    for result in limited_results:
        file_path = result['file_path']
        if file_path != current_file:
            current_file = file_path
            file_name = os.path.basename(file_path)
            formatted_lines.append(f"\n--- File: {file_name} ---")

        match_line = result['match_line_number']
        match_text = result['match_text']
        matched_content = result['matched_content']

        # Context lines before the match
        if 'context_before' in result:
            for context_line in result['context_before']:
                formatted_lines.append(f"{context_line['line_number']}:{context_line['content']}")

        # The matching line itself
        formatted_lines.append(f"{match_line}:{matched_content}")

        # Context lines after the match
        if 'context_after' in result:
            for context_line in result['context_after']:
                formatted_lines.append(f"{context_line['line_number']}:{context_line['content']}")

    formatted_output = "\n".join(formatted_lines)

    return {
        "content": [
            {
                "type": "text",
                "text": formatted_output
            }
        ]
    }
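A minimal usage sketch (editorial, not part of the commit): the file path and pattern below are invented, and the call assumes the module's imports and get_allowed_directory() are configured as in the rest of the file.

result = regex_grep(r"def \w+\(", ["demo/app.py"], context_lines=2, limit=10)
# The tool returns an MCP-style payload; the report text lives at:
print(result["content"][0]["text"])
# e.g. "Found 4 matches, showing top 4 results:" followed by per-file groups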


def regex_grep_count(pattern: str, file_paths: List[str],
                     case_sensitive: bool = False) -> Dict[str, Any]:
    """Count regular-expression matches across files."""
    if not pattern:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: Pattern cannot be empty"
                }
            ]
        }

    if not file_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": "Error: File path list cannot be empty"
                }
            ]
        }

    # Compile the regular expression
    try:
        flags = 0 if case_sensitive else re.IGNORECASE
        compiled_pattern = re.compile(pattern, flags)
    except re.error as e:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Invalid regular expression '{pattern}': {str(e)}"
                }
            ]
        }

    # Restrict searches to the project directory
    project_data_dir = get_allowed_directory()

    # Validate the file paths
    valid_paths = []
    for file_path in file_paths:
        try:
            # Resolve relative paths
            if not os.path.isabs(file_path):
                # Strip a leading 'projects/' prefix if present
                clean_path = file_path
                if clean_path.startswith('projects/'):
                    clean_path = clean_path[9:]  # drop the 'projects/' prefix
                elif clean_path.startswith('./projects/'):
                    clean_path = clean_path[11:]  # drop the './projects/' prefix

                # Look for the file inside the project directory
                full_path = os.path.join(project_data_dir, clean_path.lstrip('./'))
                if os.path.exists(full_path):
                    valid_paths.append(full_path)
                else:
                    # Fall back to a recursive lookup if the direct path is missing
                    found = find_file_in_project(clean_path, project_data_dir)
                    if found:
                        valid_paths.append(found)
            else:
                if file_path.startswith(project_data_dir) and os.path.exists(file_path):
                    valid_paths.append(file_path)
        except Exception:
            continue

    if not valid_paths:
        return {
            "content": [
                {
                    "type": "text",
                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
                }
            ]
        }

    # Tally the matches
    total_matches = 0
    total_lines_with_matches = 0
    file_stats = {}

    for file_path in valid_paths:
        try:
            matches, lines_with_matches = regex_count_in_file(file_path, compiled_pattern, case_sensitive)
            total_matches += matches
            total_lines_with_matches += lines_with_matches

            file_name = os.path.basename(file_path)
            file_stats[file_name] = {
                'matches': matches,
                'lines_with_matches': lines_with_matches
            }
        except Exception:
            continue

    # Format the output
    formatted_lines = []
    formatted_lines.append("=== Regex Match Statistics ===")
    formatted_lines.append(f"Pattern: {pattern}")
    formatted_lines.append(f"Files searched: {len(valid_paths)}")
    formatted_lines.append(f"Total matches: {total_matches}")
    formatted_lines.append(f"Total lines with matches: {total_lines_with_matches}")
    formatted_lines.append("")

    # Per-file statistics
    formatted_lines.append("=== Statistics by File ===")
    for file_name, stats in sorted(file_stats.items()):
        formatted_lines.append(f"File: {file_name}")
        formatted_lines.append(f"  Matches: {stats['matches']}")
        formatted_lines.append(f"  Lines with matches: {stats['lines_with_matches']}")
        formatted_lines.append("")

    formatted_output = "\n".join(formatted_lines)

    return {
        "content": [
            {
                "type": "text",
                "text": formatted_output
            }
        ]
    }


def regex_search_in_file(file_path: str, pattern: re.Pattern,
                         context_lines: int, case_sensitive: bool) -> List[Dict[str, Any]]:
    """Search a single file with a regular expression, collecting context lines."""
    results = []

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception:
        return results

    for line_number, line in enumerate(lines, 1):
        line_content = line.rstrip('\n\r')

        # Find all matches on this line
        matches = list(pattern.finditer(line_content))
        if matches:
            # Gather context
            context_before = []
            context_after = []

            if context_lines > 0:
                # Lines before the match
                start_line = max(0, line_number - 1 - context_lines)
                for i in range(start_line, line_number - 1):
                    if i < len(lines):
                        context_before.append({
                            'line_number': i + 1,
                            'content': lines[i].rstrip('\n\r')
                        })

                # Lines after the match
                end_line = min(len(lines), line_number + context_lines)
                for i in range(line_number, end_line):
                    if i < len(lines):
                        context_after.append({
                            'line_number': i + 1,
                            'content': lines[i].rstrip('\n\r')
                        })

            # Emit one result per match
            for match in matches:
                result = {
                    'file_path': file_path,
                    'match_line_number': line_number,
                    'match_text': line_content,
                    'matched_content': match.group(0),
                    'start_pos': match.start(),
                    'end_pos': match.end()
                }

                if context_before:
                    result['context_before'] = context_before

                if context_after:
                    result['context_after'] = context_after

                results.append(result)

    return results


def regex_count_in_file(file_path: str, pattern: re.Pattern,
                        case_sensitive: bool) -> tuple[int, int]:
    """Count regex matches in a single file."""
    total_matches = 0
    lines_with_matches = 0

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception:
        return total_matches, lines_with_matches

    for line_number, line in enumerate(lines, 1):
        line_content = line.rstrip('\n\r')

        # Count matches on this line
        matches = list(pattern.finditer(line_content))
        if matches:
            total_matches += len(matches)
            lines_with_matches += 1

    return total_matches, lines_with_matches
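A quick self-contained check of the two helpers above (editorial sketch; the temp file and pattern are invented):

import re
import tempfile
import os

tmp = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
tmp.write("alpha 100g\nbeta\ngamma 900g 1200g\n")
tmp.close()

pat = re.compile(r"\d+g")
hits = regex_search_in_file(tmp.name, pat, context_lines=1, case_sensitive=False)
# One result per match: line 1 yields one, line 3 yields two
assert [h['match_line_number'] for h in hits] == [1, 3, 3]
# (total matches, lines containing at least one match)
assert regex_count_in_file(tmp.name, pat, case_sensitive=False) == (3, 2)
os.unlink(tmp.name)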


async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
    """Handle MCP request"""
    try:
@@ -685,6 +1048,34 @@ async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
                "result": result
            }

        elif tool_name == "regex_grep":
            pattern = arguments.get("pattern", "")
            file_paths = arguments.get("file_paths", [])
            context_lines = arguments.get("context_lines", 0)
            case_sensitive = arguments.get("case_sensitive", False)
            limit = arguments.get("limit", 50)

            result = regex_grep(pattern, file_paths, context_lines, case_sensitive, limit)

            return {
                "jsonrpc": "2.0",
                "id": request_id,
                "result": result
            }

        elif tool_name == "regex_grep_count":
            pattern = arguments.get("pattern", "")
            file_paths = arguments.get("file_paths", [])
            case_sensitive = arguments.get("case_sensitive", False)

            result = regex_grep_count(pattern, file_paths, case_sensitive)

            return {
                "jsonrpc": "2.0",
                "id": request_id,
                "result": result
            }

        else:
            return {
                "jsonrpc": "2.0",
@@ -45,5 +45,75 @@
          "file_paths"
        ]
      }
    },
    {
      "name": "regex_grep",
      "description": "**Regex Pattern Search**: Search files using regular expressions with context lines support.\n\n**Core Features**:\n- Pure regex pattern matching without weight requirements\n- Context lines support for showing surrounding code\n- Case-sensitive/insensitive search options\n- File grouping in results for better organization\n\n**Parameters**:\n- **pattern**: Regular expression pattern to search for\n- **file_paths**: List of files to search in\n- **context_lines**: Number of lines before and after each match (default: 0)\n- **case_sensitive**: Whether to match case (default: false)\n- **limit**: Maximum number of matches to return (default: 50)\n\n**Use Cases**:\n- Pattern-based code search when you know the exact regex\n- Finding function definitions, class declarations, imports\n- Searching for specific code patterns or structures\n- Context-aware search when you need surrounding lines\n- Debugging and code navigation\n\n**Output Format**:\n- Shows total matches found\n- Groups results by file\n- Displays line numbers with matched content\n- Includes context lines when specified",
      "inputSchema": {
        "type": "object",
        "properties": {
          "pattern": {
            "type": "string",
            "description": "Regular expression pattern to search for"
          },
          "file_paths": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "List of file paths to search"
          },
          "context_lines": {
            "type": "integer",
            "description": "Number of context lines before and after each match",
            "default": 0,
            "minimum": 0
          },
          "case_sensitive": {
            "type": "boolean",
            "description": "Whether the search is case-sensitive",
            "default": false
          },
          "limit": {
            "type": "integer",
            "description": "Maximum number of matches to return",
            "default": 50,
            "minimum": 1
          }
        },
        "required": [
          "pattern",
          "file_paths"
        ]
      }
    },
    {
      "name": "regex_grep_count",
      "description": "**Regex Match Statistics**: Count regex pattern matches across files without returning actual content.\n\n**Core Features**:\n- Pure regex pattern counting without weight requirements\n- Comprehensive match statistics per file\n- Total match and line counts\n- Case-sensitive/insensitive search options\n\n**Parameters**:\n- **pattern**: Regular expression pattern to search for\n- **file_paths**: List of files to search in\n- **case_sensitive**: Whether to match case (default: false)\n\n**Use Cases**:\n- Quick assessment of pattern prevalence across codebase\n- Counting occurrences of specific functions, variables, or patterns\n- Measuring code complexity or usage statistics\n- Pre-search analysis to understand scope\n- Quality metrics and code analysis\n\n**Output Format**:\n- Summary statistics with total matches and files searched\n- Per-file breakdown with match counts and lines affected\n- Clear formatting for easy analysis and reporting",
      "inputSchema": {
        "type": "object",
        "properties": {
          "pattern": {
            "type": "string",
            "description": "Regular expression pattern to search for"
          },
          "file_paths": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "List of file paths to search"
          },
          "case_sensitive": {
            "type": "boolean",
            "description": "Whether the search is case-sensitive",
            "default": false
          }
        },
        "required": [
          "pattern",
          "file_paths"
        ]
      }
    }
  ]
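For illustration only: a JSON-RPC payload that exercises the regex_grep entry above might look like the following, assuming the standard MCP tools/call envelope (the method and params names are assumptions, as is the project path):

import json

request = {
    "jsonrpc": "2.0",
    "id": 7,
    "method": "tools/call",  # assumed MCP method name
    "params": {
        "name": "regex_grep",
        "arguments": {
            "pattern": r"class \w+",
            "file_paths": ["projects/demo/app.py"],  # hypothetical path
            "context_lines": 2,
            "limit": 20
        }
    }
}
print(json.dumps(request, indent=2))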
@@ -9,6 +9,7 @@
- Plain-text document (document.txt)
  - Raw markdown text; it gives the data its full context, but is hard to search directly.
  - A retrieved line is only meaningful together with the 10 lines before and after it; a single line is short and meaningless on its own.
  - When necessary, use the `multi_keyword-regex_grep` tool with the contextLines parameter to pull context from document.txt.
- Paginated data layer (pagination.txt):
  - Each line holds one complete page of data, so no surrounding context is needed; the neighboring lines are the previous and next pages. Suited to fetching all the material in one go.
  - The primary file for regex and keyword retrieval; find the key information here first, then consult document.txt.
@@ -33,9 +34,13 @@
### Problem analysis
1. **Problem analysis**: analyze the question and list the keywords retrieval is likely to involve, preparing for the next step.
2. **Keyword extraction**: devise and generate the core search keywords; the next step expands on them.
3. **Numeric keyword expansion**:
3. **Data preview**: for content with numbers (prices, weights, lengths, ...), call `multi_keyword-regex_grep` on `document.txt` several times to preview the data patterns; this returns little data and supports the keyword expansion that follows.

### Keyword expansion
4. **Keyword expansion**: expand and refine the search keywords based on the recalled content. Keep the keyword set as rich as possible; this matters for multi-keyword retrieval.
5. **Numeric expansion**:
   a. **Unit-normalization expansion**:
      - Weight: 1千克 → 1000g, 1kg, 1.0kg, 1000.0g, 1公斤
      - Weight: 1千克 → 1000g, 1kg, 1.0kg, 1000.0g, 1公斤, 0.99kg
      - Length: 3米 → 3m, 3.0m, 300cm, 300厘米
      - Currency: ¥9.99 → 9.99元, ¥9.99, 九点九九元
      - Time: 2小时 → 120分钟, 7200秒, 2h, 2.0小时, 两小时
@@ -51,34 +56,97 @@
      - Percentage: 25% → 0.25, 百分之二十五
      - Time: 7天 → 7日, 一周, 168小时

   d. **Range expansion** (moderate):
      - Weight: 1kg → 900g, 990g, 0.99kg, 1200g
      - Length: 3 meters → 2.8m, 3.5m, 280cm, 290 centimeters
      - Price: 100元 → 90元, 95元, 105元, 110元
      - Time: 7天 → 5天, 6天, 8天, 10天
   d. **Range expansion (moderate)**: read the quantity range that the natural-language phrasing implies, then turn that range into a regular expression that matches the corresponding text patterns.

**1. Weight**
- **Case 1: fuzzy exact value**
  - **Semantics**: `about 1kg / around 1000g`
  - **Range reading**: allow an interval around the value, e.g. ±20%, i.e. 800g to 1200g.
  - **Regex**: `/([01]\.\d+\s*[kK]?[gG]|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG])/`
  - **Explanation**:
    - `[01]\.\d+\s*[kK]?[gG]`: matches kilogram values of the form 0.x or 1.x, approximating 0.8 to 1.2 (e.g. `0.95 kg`, `1.2kg`).
    - `(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG]`: matches gram values from 800 to 1200.

   e. **Regex range expansion** (important):
      - From the numeric keywords expanded above, generate range-matching regular expressions; they retrieve better.
      - Weight: 1kg/1000g/800g-1200g → /[01].\d+\s*kg|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*g/
      - Length: 3m/3.0m → /3\s*m|3.\d+\s*m/
### Keyword expansion
4. **Data preview**:
   - **Regex retrieval for numeric content**: for content with numbers (prices, weights, lengths, ...), prefer calling `multi_keyword-search` on `document.txt` to preview the data; it returns little data and supports the keyword expansion that follows.
5. **Keyword expansion**: expand and refine the search keywords based on the recalled content. Keep the keyword set as rich as possible; this matters for multi-keyword retrieval.
- **Case 2: upper bound**
  - **Semantics**: `less than 1kg`
  - **Range reading**: from very small values (e.g. 1g) up to values near 1kg (e.g. 800g or 999g), excluding 1kg itself.
  - **Regex**: `/\b([1-9]\d{0,2}\s*[gG]|0?\.\d+\s*[kK]?[gG])\b/`
  - **Explanation**:
    - `[1-9]\d{0,2}\s*[gG]`: matches 1 to 999 grams.
    - `0?\.\d+\s*[kK]?[gG]`: matches sub-1 kilogram values such as 0.1kg and .5kg.

**2. Length**
- **Case 1: approximate value**
  - **Semantics**: `3米`
  - **Range reading**: may denote an approximate value, in the range 2.5m to 3.5m.
  - **Regex**: `/\b([2-3]\.\d+\s*[mM]|2\.5|3\.5)\b/`
  - **Explanation**: matches meter values of the form 2.x or 3.x, approximating the 2.5 to 3.5 m range.

- **Case 2: upper bound**
  - **Semantics**: `less than 3 meters`
  - **Range reading**: from very small values (e.g. 0.1m) up to values near 3 meters (e.g. 2.9m).
  - **Regex**: `/\b([0-2]\.\d+\s*[mM]|[12]?\d{1,2}\s*[cC][mM])\b/`
  - **Explanation**:
    - `[0-2]\.\d+\s*[mM]`: matches 0.0 to 2.9 meters.
    - `[12]?\d{1,2}\s*[cC][mM]`: also covers values written in centimeters, such as 50cm, 150cm, 299cm.

**3. Price**
- **Case 1: reference price**
  - **Semantics**: `100元`
  - **Range reading**: likely a reference price with a band of ±10元, i.e. 90元 to 110元.
  - **Regex**: `/\b(9[0-9]|10[0-9]|110)\s*元?\b/`
  - **Explanation**: matches integers from `90` to `110`, optionally followed by "元".

- **Case 2: price interval**
  - **Semantics**: `between 100元 and 200元`
  - **Range reading**: an explicit price interval.
  - **Regex**: `/\b(1[0-9]{2})\s*元?\b/`
  - **Explanation**: matches integers from `100` to `199`. To include 200 exactly, write `(1[0-9]{2}|200)`.

**4. Time**
- **Case 1: approximate duration**
  - **Semantics**: `7天`
  - **Range reading**: may vary by a few days, e.g. 5 to 10 days.
  - **Regex**: `/\b([5-9]|10)\s*天?\b/`
  - **Explanation**: matches the numbers `5`, `6`, `7`, `8`, `9`, `10`, optionally followed by "天".

- **Case 2: longer than a given span**
  - **Semantics**: `more than a week`
  - **Range reading**: 8 days and up, or 8 days to a month (30 days).
  - **Regex**: `/\b([8-9]|[12][0-9]|30)\s*天?\b/`
  - **Explanation**: matches `8` to `30` days.

**5. Temperature**
- **Case 1: comfortable temperature**
  - **Semantics**: `room temperature (about 25°C)`
  - **Range reading**: usually 20°C to 30°C.
  - **Regex**: `/\b(2[0-9]|30)\s*°?[Cc]\b/`
  - **Explanation**: matches integers from `20` to `30`, followed by `C` or `°C`.

- **Case 2: sub-zero temperature**
  - **Semantics**: `below zero`
  - **Range reading**: any temperature below 0°C.
  - **Regex**: `/\b-?[1-9]\d*\s*°?[Cc]\b/`
  - **Note**: this regex is simplistic; real use must handle negative values precisely.

**6. Percentage**
- **Case 1: high concentration**
  - **Semantics**: `very high concentration (above 90%)`
  - **Range reading**: 90% to 100%.
  - **Regex**: `/\b(9[0-9]|100)\s*%?\b/`
  - **Explanation**: matches integers from `90` to `100`, followed by an optional `%` sign.

- **Case 2: more than half**
  - **Semantics**: `the majority`
  - **Range reading**: can be read as 50% to 90%.
  - **Regex**: `/\b([5-8][0-9]|90)\s*%?\b/`
  - **Explanation**: matches integers from `50` to `90`.
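A quick sanity check of two of the patterns above (editorial sketch; the sample strings are invented):

import re

weight_text = "净重 0.95 kg,备货 1100g"
price_text = "售价 95 元,原价 120 元"
weight = re.compile(r"([01]\.\d+\s*[kK]?[gG]|(8\d{2}|9\d{2}|1[01]\d{2}|1200)\s*[gG])")
price = re.compile(r"\b(9[0-9]|10[0-9]|110)\s*元?\b")

print([m.group(0) for m in weight.finditer(weight_text)])  # ['0.95 kg', '1100g']
print([m.group(0) for m in price.finditer(price_text)])    # ['95 元'] (120 falls outside 90-110)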

### Strategy formulation
6. **Path selection**: choose the optimal search path for the query's complexity
   - **Strategy principle**: prefer simple field matching; avoid complex regular expressions
   - **Optimization idea**: use loose matching plus post-filtering to raise recall


### Execution and verification
7. **Search execution**: you must run a comprehensive multi-keyword + regex hybrid search with `multi_keyword-search`; do not give a final answer without this step.
8. **Cross-validation**: run context lookups for the keywords against `document.txt`, fetching the 20 lines before and after for reference.
   - Ensure completeness through multi-angle searches
   - Use different keyword combinations
   - Try multiple query patterns
   - Cross-check across the data layers
7. **Scale estimation**: call `multi_keyword-regex_grep_count` to estimate the size of the result set and avoid data overload
8. **Search execution**: before giving the final answer, you must run the multi-keyword weighted hybrid search with `multi_keyword-search`.

## Advanced search strategies

@@ -13,9 +13,9 @@ mkdir -p /app/queue_data
# Wait briefly to make sure the directories have been created
sleep 1

echo "Starting FastAPI application..."
echo "Starting FastAPI application with uvicorn..."
# Start the FastAPI application in the background
python fastapi_app.py &
uvicorn fastapi_app:app --host 0.0.0.0 --port 8000 &

echo "Starting queue consumer..."
# Start the queue consumer in the background