qwen_agent/mcp/excel_csv_operator_server.py

#!/usr/bin/env python3
"""
Excel and CSV file operation MCP server.
Supports reading, searching, and enum-value retrieval for Excel/CSV files.
Modeled on the implementation style of multi_keyword_search_server.py.
"""

import json
import os
import sys
import asyncio
import re
import chardet
from typing import Any, Dict, List, Optional, Union
import pandas as pd
from mcp_common import (
    get_allowed_directory,
    load_tools_from_json,
    resolve_file_path,
    find_file_in_project,
    is_regex_pattern,
    compile_pattern,
    create_error_response,
    create_success_response,
    create_initialize_response,
    create_ping_response,
    create_tools_list_response,
    handle_mcp_streaming
)


def detect_encoding(file_path: str) -> str:
    """Guess the text encoding of a file from its first 10,000 bytes.

    Detection is best-effort: any failure to open, read or analyse the file
    falls back to ``'utf-8'`` instead of raising.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by ``chardet``, or ``'utf-8'`` when
        detection fails, reports nothing, or reports plain ASCII.
    """
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read(10000)  # Only a prefix is sampled to keep detection cheap
        encoding = chardet.detect(raw_data)['encoding']
    except Exception:
        # Deliberately broad (detection is best-effort), but no longer a bare
        # ``except:`` so KeyboardInterrupt/SystemExit still propagate.
        return 'utf-8'

    # ASCII is a strict subset of UTF-8. A pure-ASCII sample says nothing about
    # bytes beyond the sampled prefix, so decode with the superset to avoid
    # failing on non-ASCII characters that appear later in the file.
    if not encoding or encoding.lower() == 'ascii':
        return 'utf-8'
    return encoding


class ExcelCSVOperator:
    """Read-only query helper for Excel (.xlsx/.xls) and CSV files.

    ``load_data``, ``get_sheets`` and ``get_schema`` return data and raise
    ``ValueError`` on failure. ``full_text_search``, ``filter_search`` and
    ``get_field_enums`` always return a string; problems are reported inside
    that string rather than raised.
    """

    # Ordering operators accepted by ``filter_search``, keyed by operator name.
    _COMPARATORS = {
        'gt': lambda column, value: column > value,
        'lt': lambda column, value: column < value,
        'gte': lambda column, value: column >= value,
        'lte': lambda column, value: column <= value,
    }

    def __init__(self):
        """Initialise the list of supported extensions and an empty encoding cache."""
        self.supported_extensions = ['.xlsx', '.xls', '.csv']
        # NOTE(review): no method of this class reads or writes this cache yet.
        self.encoding_cache = {}

    def _validate_file(self, file_path: str) -> str:
        """Resolve ``file_path`` and check that its extension is supported.

        Args:
            file_path: Path as supplied by the caller; it is passed through
                ``resolve_file_path`` before the extension check.

        Returns:
            The resolved path.

        Raises:
            ValueError: If the resolved path's extension is not one of
                ``self.supported_extensions``.
        """
        resolved_path = resolve_file_path(file_path)

        file_ext = os.path.splitext(resolved_path)[1].lower()
        if file_ext not in self.supported_extensions:
            raise ValueError(f"Unsupported file format: {file_ext}, supported formats: {self.supported_extensions}")

        return resolved_path

    def load_data(self, file_path: str, sheet_name: Optional[str] = None) -> pd.DataFrame:
        """Load a CSV file or one Excel sheet into a DataFrame.

        Args:
            file_path: Path to a supported file.
            sheet_name: Excel sheet to read. When falsy, pandas' default
                (the first sheet) is used. Ignored for CSV files.

        Returns:
            The loaded DataFrame with every missing value replaced by ``''``.

        Raises:
            ValueError: If the extension is unsupported or reading fails.
        """
        file_path = self._validate_file(file_path)
        file_ext = os.path.splitext(file_path)[1].lower()

        try:
            if file_ext == '.csv':
                df = pd.read_csv(file_path, encoding=detect_encoding(file_path))
            elif sheet_name:
                df = pd.read_excel(file_path, sheet_name=sheet_name)
            else:
                # No sheet requested: let pandas read the first sheet
                df = pd.read_excel(file_path)

            # Missing cells become empty strings so they render cleanly in text output
            return df.fillna('')
        except Exception as e:
            raise ValueError(f"File loading failed: {str(e)}") from e

    def get_sheets(self, file_path: str) -> List[str]:
        """Return the sheet names of a file.

        Args:
            file_path: Path to a supported file.

        Returns:
            The Excel sheet names, or ``['default']`` for a CSV file.

        Raises:
            ValueError: If the extension is unsupported or the workbook
                cannot be opened.
        """
        file_path = self._validate_file(file_path)
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.csv':
            return ['default']  # A CSV file has only one default sheet

        try:
            # The context manager closes the workbook handle once the names are read
            with pd.ExcelFile(file_path) as excel_file:
                return list(excel_file.sheet_names)
        except Exception as e:
            raise ValueError(f"Failed to read Excel sheet list: {str(e)}") from e

    def get_schema(self, file_path: str, sheet_name: Optional[str] = None) -> List[str]:
        """Return the column names of a file or sheet.

        Args:
            file_path: Path to a supported file.
            sheet_name: Optional Excel sheet name (see ``load_data``).

        Returns:
            The column labels of the loaded data, in order.

        Raises:
            ValueError: If the data cannot be loaded.
        """
        try:
            return self.load_data(file_path, sheet_name).columns.tolist()
        except Exception as e:
            raise ValueError(f"Failed to get schema: {str(e)}") from e

    def full_text_search(self, file_path: str, keywords: List[str],
                         top_k: int = 10, case_sensitive: bool = False) -> str:
        """Search every row of a file for keywords and regex patterns.

        Each keyword is classified by ``compile_pattern``; entries for which it
        returns ``None`` are reported on stderr and skipped. CSV files are
        searched as a single dataset, Excel files sheet by sheet. Rows are
        ranked by how many distinct keywords they match.

        Args:
            file_path: Path to a supported file.
            keywords: Keywords/patterns to look for. A single string is
                treated as a one-element list.
            top_k: Maximum number of rows to return.
            case_sensitive: Whether matching is case-sensitive.

        Returns:
            Comma-separated text with a header line
            (``sheet,row_index,match_count,matched_content,match_details``),
            or a message string when nothing matches or an error occurs.
        """
        if not keywords:
            return "Error: Keyword list cannot be empty"
        if isinstance(keywords, str):
            # A bare string would otherwise be iterated character by character
            keywords = [keywords]

        valid_keywords = []
        regex_errors = []
        for keyword in keywords:
            if compile_pattern(keyword) is None:
                regex_errors.append(keyword)
            else:
                valid_keywords.append(keyword)

        if regex_errors:
            error_msg = f"Warning: The following regular expressions failed to compile and will be ignored: {', '.join(regex_errors)}"
            # Diagnostics go to stderr: stdout presumably carries the MCP
            # protocol stream and must not be polluted with free text.
            print(error_msg, file=sys.stderr)

        if not valid_keywords:
            return "Error: No valid search keywords"

        try:
            validated_path = self._validate_file(file_path)
            file_ext = os.path.splitext(validated_path)[1].lower()

            if file_ext == '.csv':
                # A CSV file has only one dataset
                sheets = ['default']
            else:
                # For Excel files, search all sheets
                sheets = self.get_sheets(validated_path)

            all_results = []
            for sheet in sheets:
                all_results.extend(
                    self._search_in_file(validated_path, valid_keywords, case_sensitive, sheet)
                )

            # Best matches first; the sort is stable so file order breaks ties
            all_results.sort(key=lambda item: item['match_count'], reverse=True)
            limited_results = all_results[:top_k]

            if not limited_results:
                return "No matching results found"

            return self._format_search_results(limited_results)

        except Exception as e:
            return f"Search failed: {str(e)}"

    @staticmethod
    def _format_search_results(results: List[Dict[str, Any]]) -> str:
        """Render search hits as comma-separated lines preceded by a header."""
        csv_lines = ["sheet,row_index,match_count,matched_content,match_details"]

        for result in results:
            # ASCII commas inside values are swapped for full-width commas so
            # that they cannot be mistaken for field separators.
            sheet = str(result.get('sheet', '')).replace(',', '，')
            row_index = str(result.get('row_index', ''))
            match_count = str(result.get('match_count', 0))
            matched_content = str(result.get('matched_content', '')).replace(',', '，').replace('\n', ' ')
            match_details = str(result.get('match_details', '')).replace(',', '，')

            csv_lines.append(f"{sheet},{row_index},{match_count},{matched_content},{match_details}")

        return "\n".join(csv_lines)

    @staticmethod
    def _build_matchers(keywords: List[str], case_sensitive: bool) -> List[tuple]:
        """Prepare one ``(kind, label, matcher)`` tuple per usable keyword.

        For regex keywords ``matcher`` is a compiled pattern (with
        ``re.IGNORECASE`` added when the search is case-insensitive) and
        ``label`` is the original keyword. For plain keywords ``matcher`` is
        the text to look for (lower-cased when case-insensitive) and ``label``
        is the value returned by ``compile_pattern``.
        """
        matchers = []
        for keyword in keywords:
            compiled = compile_pattern(keyword)
            if compiled is None:
                continue
            if isinstance(compiled, re.Pattern):
                if not case_sensitive:
                    compiled = re.compile(compiled.pattern, compiled.flags | re.IGNORECASE)
                matchers.append(('regex', keyword, compiled))
            else:
                needle = compiled if case_sensitive else compiled.lower()
                matchers.append(('keyword', compiled, needle))
        return matchers

    def _search_in_file(self, file_path: str, keywords: List[str],
                        case_sensitive: bool, sheet_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search one file/sheet and return a record for every matching row.

        The search is best-effort: if loading or scanning fails, the error is
        written to stderr and the rows collected so far are returned.

        Args:
            file_path: Path to a supported file.
            keywords: Keywords/patterns to look for.
            case_sensitive: Whether matching is case-sensitive.
            sheet_name: Sheet to search; also copied into each result.

        Returns:
            A list of dicts with the keys ``sheet``, ``row_index``,
            ``match_count``, ``matched_content`` and ``match_details``.
        """
        results = []

        try:
            df = self.load_data(file_path, sheet_name)
            # Pattern preparation is loop-invariant, so do it once per sheet
            matchers = self._build_matchers(keywords, case_sensitive)

            for row_index, row in df.iterrows():
                # The whole row, minus blank cells, is searched as one string
                row_content = " ".join(
                    text for text in map(str, row.values) if text.strip()
                )
                haystack = row_content if case_sensitive else row_content.lower()

                details = []
                for kind, label, matcher in matchers:
                    if kind == 'regex':
                        # Regexes run on the original text; case-insensitivity
                        # is carried by the pattern's own flags.
                        found = matcher.search(row_content)
                        if found:
                            details.append(f"[regex:{label}={found.group(0)}]")
                    elif matcher in haystack:
                        details.append(f"[keyword:{label}]")

                if details:
                    results.append({
                        'sheet': sheet_name,
                        'row_index': row_index,
                        'match_count': len(details),
                        'matched_content': row_content,
                        'match_details': " ".join(details)
                    })

        except Exception as e:
            print(f"Error searching file {file_path} (sheet: {sheet_name}): {str(e)}", file=sys.stderr)

        return results

    @staticmethod
    def _compare_column(column: pd.Series, value: Any, comparator) -> pd.Series:
        """Apply an ordering comparison, retrying numerically on a type clash.

        The direct comparison is tried first. If it raises ``TypeError`` (for
        example a numeric column whose blanks were filled with ``''``, or a
        numeric threshold supplied as a string), both sides are coerced to
        numbers; cells that are not numeric never match. If ``value`` is not
        numeric either, the original ``TypeError`` is re-raised.
        """
        try:
            return comparator(column, value)
        except TypeError as exc:
            try:
                numeric_value = float(value)
            except (TypeError, ValueError):
                raise exc from None
            return comparator(pd.to_numeric(column, errors='coerce'), numeric_value)

    def filter_search(self, file_path: str, filters: Dict,
                      sheet_name: Optional[str] = None) -> str:
        """Return the rows that satisfy every per-field filter condition.

        Args:
            file_path: Path to a supported file.
            filters: Mapping of column name to a condition dict with an
                optional ``'operator'`` (default ``'eq'``) and a ``'value'``.
                Supported operators: ``eq``, ``gt``, ``lt``, ``gte``, ``lte``,
                ``contains`` (literal substring) and ``regex`` (matched from
                the start of the cell text).
            sheet_name: Optional Excel sheet name (see ``load_data``).

        Returns:
            The matching rows as CSV text (without the index), or a message
            string when nothing matches or an error occurs.
        """
        if not filters:
            return "Error: Filter conditions cannot be empty"

        try:
            df = self.load_data(file_path, sheet_name)
            filtered_df = df.copy()

            for field_name, filter_condition in filters.items():
                if field_name not in df.columns:
                    return f"Error: Field '{field_name}' does not exist"

                op_name = filter_condition.get('operator', 'eq')
                value = filter_condition.get('value')
                column = filtered_df[field_name]
                comparator = self._COMPARATORS.get(op_name) if isinstance(op_name, str) else None

                if op_name == 'eq':
                    mask = column == value
                elif comparator is not None:
                    mask = self._compare_column(column, value, comparator)
                elif op_name == 'contains':
                    # regex=False: 'contains' is a literal substring test;
                    # patterns belong to the separate 'regex' operator.
                    mask = column.astype(str).str.contains(str(value), na=False, regex=False)
                elif op_name == 'regex':
                    try:
                        pattern = re.compile(str(value))
                    except re.error as e:
                        return f"Error: Regular expression '{value}' compilation failed: {str(e)}"
                    mask = column.astype(str).str.match(pattern, na=False)
                else:
                    return f"Error: Unsupported operator '{op_name}'"

                filtered_df = filtered_df[mask]

            if filtered_df.empty:
                return "No records matching conditions found"

            return filtered_df.to_csv(index=False, encoding='utf-8')

        except Exception as e:
            return f"Filter search failed: {str(e)}"

    def get_field_enums(self, file_path: str, field_names: List[str],
                        sheet_name: Optional[str] = None, max_enum_count: int = 100,
                        min_occurrence: int = 1) -> str:
        """Summarise the distinct values of the given columns.

        Args:
            file_path: Path to a supported file.
            field_names: Columns to summarise.
            sheet_name: Optional Excel sheet name (see ``load_data``).
            max_enum_count: Maximum number of values listed per column.
            min_occurrence: Values occurring fewer times than this are omitted.

        Returns:
            One line per column listing ``value(count)`` pairs, most frequent
            first, followed by totals; or a message string on error.
        """
        if not field_names:
            return "Error: Field name list cannot be empty"

        try:
            df = self.load_data(file_path, sheet_name)

            missing_fields = [field for field in field_names if field not in df.columns]
            if missing_fields:
                return f"Error: Fields do not exist: {', '.join(missing_fields)}"

            total_rows = len(df)
            enum_results = {}
            for field in field_names:
                value_counts = df[field].value_counts()
                # Drop rare values first, then cap how many are listed
                filtered_counts = value_counts[value_counts >= min_occurrence]
                top_values = filtered_counts.head(max_enum_count)

                enum_results[field] = {
                    'enum_values': [f"{value}({count})" for value, count in top_values.items()],
                    'total_unique': len(value_counts),
                    'total_filtered': len(filtered_counts),
                    'total_rows': total_rows
                }

            output_lines = []
            for field, data in enum_results.items():
                enum_str = ", ".join(data['enum_values'])
                field_info = f"{field}: [{enum_str}] (total: {data['total_unique']} unique values, after filtering: {data['total_filtered']}, total rows: {data['total_rows']})"
                output_lines.append(field_info)

            return "\n".join(output_lines)

        except Exception as e:
            return f"Failed to get enum values: {str(e)}"


# Module-level ExcelCSVOperator instance; handle_request routes every tool call through it.
operator = ExcelCSVOperator()


async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
    """Dispatch one MCP request and build its response.

    Handles the ``initialize``, ``ping``, ``tools/list`` and ``tools/call``
    methods. Tool calls are forwarded to the module-level ``operator``. An
    unknown method or tool yields a -32601 error response, and any exception
    raised while handling yields a -32603 error response.

    Args:
        request: The decoded request; ``method``, ``params`` and ``id`` are read.

    Returns:
        The response produced by the matching ``create_*_response`` helper.
    """
    try:
        request_id = request.get("id")
        method = request.get("method")
        params = request.get("params", {})

        def text_reply(text):
            # Every tool answers with a single text content item
            return create_success_response(request_id, {
                "content": [
                    {
                        "type": "text",
                        "text": text
                    }
                ]
            })

        if method == "initialize":
            return create_initialize_response(request_id, "excel-csv-operator")

        if method == "ping":
            return create_ping_response(request_id)

        if method == "tools/list":
            # Tool definitions live in a JSON file next to the server
            tool_definitions = load_tools_from_json("excel_csv_operator_tools.json")
            return create_tools_list_response(request_id, tool_definitions)

        if method != "tools/call":
            return create_error_response(request_id, -32601, f"Unknown method: {method}")

        tool_name = params.get("name")
        arguments = params.get("arguments", {})

        if tool_name == "get_excel_sheets":
            sheet_names = operator.get_sheets(arguments.get("file_path"))
            return text_reply(json.dumps(sheet_names, ensure_ascii=False, indent=2))

        if tool_name == "get_table_schema":
            columns = operator.get_schema(
                arguments.get("file_path"),
                arguments.get("sheet_name"),
            )
            return text_reply(json.dumps(columns, ensure_ascii=False, indent=2))

        if tool_name == "full_text_search":
            return text_reply(operator.full_text_search(
                arguments.get("file_path"),
                arguments.get("keywords", []),
                arguments.get("top_k", 10),
                arguments.get("case_sensitive", False),
            ))

        if tool_name == "filter_search":
            target_path = arguments.get("file_path")
            target_sheet = arguments.get("sheet_name")
            conditions = arguments.get("filters")
            return text_reply(operator.filter_search(target_path, conditions, target_sheet))

        if tool_name == "get_field_enums":
            target_path = arguments.get("file_path")
            target_sheet = arguments.get("sheet_name")
            return text_reply(operator.get_field_enums(
                target_path,
                arguments.get("field_names", []),
                target_sheet,
                arguments.get("max_enum_count", 100),
                arguments.get("min_occurrence", 1),
            ))

        return create_error_response(request_id, -32601, f"Unknown tool: {tool_name}")

    except Exception as e:
        return create_error_response(request.get("id"), -32603, f"Internal error: {str(e)}")


async def main():
    """Run the server by handing ``handle_request`` to ``handle_mcp_streaming``."""
    await handle_mcp_streaming(handle_request)


# Start the asyncio entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())